LLVM 12.0.0
AArch64ISelLowering.cpp
Go to the documentation of this file.
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Triple.h"
31#include "llvm/ADT/Twine.h"
47#include "llvm/IR/Attributes.h"
48#include "llvm/IR/Constants.h"
49#include "llvm/IR/DataLayout.h"
50#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
54#include "llvm/IR/GlobalValue.h"
55#include "llvm/IR/IRBuilder.h"
56#include "llvm/IR/Instruction.h"
59#include "llvm/IR/Intrinsics.h"
60#include "llvm/IR/IntrinsicsAArch64.h"
61#include "llvm/IR/Module.h"
64#include "llvm/IR/Type.h"
65#include "llvm/IR/Use.h"
66#include "llvm/IR/Value.h"
72#include "llvm/Support/Debug.h"
80#include <algorithm>
81#include <bitset>
82#include <cassert>
83#include <cctype>
84#include <cstdint>
85#include <cstdlib>
86#include <iterator>
87#include <limits>
88#include <tuple>
89#include <utility>
90#include <vector>
91
92using namespace llvm;
93using namespace llvm::PatternMatch;
94
95#define DEBUG_TYPE "aarch64-lower"
96
97STATISTIC(NumTailCalls, "Number of tail calls");
98STATISTIC(NumShiftInserts, "Number of vector shift inserts");
99STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
100
101// FIXME: The necessary dtprel relocations don't seem to be supported
102// well in the GNU bfd and gold linkers at the moment. Therefore, by
103// default, for now, fall back to GeneralDynamic code generation.
105 "aarch64-elf-ldtls-generation", cl::Hidden,
106 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
107 cl::init(false));
108
109static cl::opt<bool>
110EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
111 cl::desc("Enable AArch64 logical imm instruction "
112 "optimization"),
113 cl::init(true));
114
115// Temporary option added for the purpose of testing functionality added
116// to DAGCombiner.cpp in D92230. It is expected that this can be removed
117// in future when both implementations will be based off MGATHER rather
118// than the GLD1 nodes added for the SVE gather load intrinsics.
119static cl::opt<bool>
120EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
121 cl::desc("Combine extends of AArch64 masked "
122 "gather intrinsics"),
123 cl::init(true));
124
125/// Value type used for condition codes.
126static const MVT MVT_CC = MVT::i32;
127
128static inline EVT getPackedSVEVectorVT(EVT VT) {
129 switch (VT.getSimpleVT().SimpleTy) {
130 default:
131 llvm_unreachable("unexpected element type for vector");
132 case MVT::i8:
133 return MVT::nxv16i8;
134 case MVT::i16:
135 return MVT::nxv8i16;
136 case MVT::i32:
137 return MVT::nxv4i32;
138 case MVT::i64:
139 return MVT::nxv2i64;
140 case MVT::f16:
141 return MVT::nxv8f16;
142 case MVT::f32:
143 return MVT::nxv4f32;
144 case MVT::f64:
145 return MVT::nxv2f64;
146 case MVT::bf16:
147 return MVT::nxv8bf16;
148 }
149}
150
151// NOTE: Currently there's only a need to return integer vector types. If this
152// changes then just add an extra "type" parameter.
154 switch (EC.getKnownMinValue()) {
155 default:
156 llvm_unreachable("unexpected element count for vector");
157 case 16:
158 return MVT::nxv16i8;
159 case 8:
160 return MVT::nxv8i16;
161 case 4:
162 return MVT::nxv4i32;
163 case 2:
164 return MVT::nxv2i64;
165 }
166}
167
170 "Expected scalable predicate vector type!");
171 switch (VT.getVectorMinNumElements()) {
172 default:
173 llvm_unreachable("unexpected element count for vector");
174 case 2:
175 return MVT::nxv2i64;
176 case 4:
177 return MVT::nxv4i32;
178 case 8:
179 return MVT::nxv8i16;
180 case 16:
181 return MVT::nxv16i8;
182 }
183}
184
185/// Returns true if VT's elements occupy the lowest bit positions of its
186/// associated register class without any intervening space.
187///
188/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
189/// same register class, but only nxv8f16 can be treated as a packed vector.
190static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
192 "Expected legal vector type!");
193 return VT.isFixedLengthVector() ||
195}
196
197// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
198// predicate and end with a passthru value matching the result type.
232
234 const AArch64Subtarget &STI)
235 : TargetLowering(TM), Subtarget(&STI) {
236 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
237 // we have to make something up. Arbitrarily, choose ZeroOrOne.
239 // When comparing vectors the result sets the different elements in the
240 // vector to all-one or all-zero.
242
243 // Set up the register classes.
244 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
245 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
246
247 if (Subtarget->hasFPARMv8()) {
248 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
249 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
250 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
251 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
252 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
253 }
254
255 if (Subtarget->hasNEON()) {
256 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
257 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
258 // Someone set us up the NEON.
259 addDRTypeForNEON(MVT::v2f32);
260 addDRTypeForNEON(MVT::v8i8);
261 addDRTypeForNEON(MVT::v4i16);
262 addDRTypeForNEON(MVT::v2i32);
263 addDRTypeForNEON(MVT::v1i64);
264 addDRTypeForNEON(MVT::v1f64);
265 addDRTypeForNEON(MVT::v4f16);
266 if (Subtarget->hasBF16())
267 addDRTypeForNEON(MVT::v4bf16);
268
269 addQRTypeForNEON(MVT::v4f32);
270 addQRTypeForNEON(MVT::v2f64);
271 addQRTypeForNEON(MVT::v16i8);
272 addQRTypeForNEON(MVT::v8i16);
273 addQRTypeForNEON(MVT::v4i32);
274 addQRTypeForNEON(MVT::v2i64);
275 addQRTypeForNEON(MVT::v8f16);
276 if (Subtarget->hasBF16())
277 addQRTypeForNEON(MVT::v8bf16);
278 }
279
280 if (Subtarget->hasSVE()) {
281 // Add legal sve predicate types
282 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
283 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
284 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
285 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
286
287 // Add legal sve data types
288 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
289 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
290 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
291 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
292
293 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
294 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
295 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
296 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
297 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
298 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
299
300 if (Subtarget->hasBF16()) {
301 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
302 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
303 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
304 }
305
306 if (Subtarget->useSVEForFixedLengthVectors()) {
308 if (useSVEForFixedLengthVectorVT(VT))
309 addRegisterClass(VT, &AArch64::ZPRRegClass);
310
312 if (useSVEForFixedLengthVectorVT(VT))
313 addRegisterClass(VT, &AArch64::ZPRRegClass);
314 }
315
316 for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
325 }
326
327 for (auto VT :
331
332 for (auto VT :
334 MVT::nxv2f64 }) {
346 }
347 }
348
349 // Compute derived properties from the register classes
351
352 // Provide all sorts of operation actions
386
390
394
396
397 // Custom lowering hooks are needed for XOR
398 // to fold it into CSINC/CSINV.
401
402 // Virtually no operation on f128 is legal, but LLVM can't expand them when
403 // there's a valid register class, so we need custom operations in most cases.
427
428 // Lowering for many of the conversions is actually specified by the non-f128
429 // type. The LowerXXX function will be trivial when f128 isn't involved.
460
461 // Variable arguments.
466
467 // Variable-sized objects.
470
471 if (Subtarget->isTargetWindows())
473 else
475
476 // Constant pool entries
478
479 // BlockAddress
481
482 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
491
492 // AArch64 lacks both left-rotate and popcount instructions.
498 }
499
500 // AArch64 doesn't have i32 MULH{S|U}.
503
504 // AArch64 doesn't have {U|S}MUL_LOHI.
507
511
514
520 }
527
528 // Custom lower Add/Sub/Mul with overflow.
541
550 if (Subtarget->hasFullFP16())
552 else
554
588
589 if (!Subtarget->hasFullFP16()) {
612
613 // promote v4f16 to v4f32 when that is known to be safe.
622
638
659 }
660
661 // AArch64 has implementations of a lot of rounding-like FP operations.
662 for (MVT Ty : {MVT::f32, MVT::f64}) {
677 }
678
679 if (Subtarget->hasFullFP16()) {
690 }
691
693
695
701
702 // Generate outline atomics library calls only if LSE was not specified for
703 // subtarget
704 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
730#define LCALLNAMES(A, B, N) \
731 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
732 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
733 setLibcallName(A##N##_REL, #B #N "_rel"); \
734 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
735#define LCALLNAME4(A, B) \
736 LCALLNAMES(A, B, 1) \
737 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
738#define LCALLNAME5(A, B) \
739 LCALLNAMES(A, B, 1) \
740 LCALLNAMES(A, B, 2) \
741 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
742 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
743 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
744 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
745 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
746 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
747 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
748#undef LCALLNAMES
749#undef LCALLNAME4
750#undef LCALLNAME5
751 }
752
753 // 128-bit loads and stores can be done without expanding
756
757 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
758 // custom lowering, as there are no un-paired non-temporal stores and
759 // legalization will break up 256 bit inputs.
767
768 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
769 // This requires the Performance Monitors extension.
770 if (Subtarget->hasPerfMon())
772
773 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
774 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
775 // Issue __sincos_stret if available.
778 } else {
781 }
782
783 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
784 // MSVCRT doesn't have powi; fall back to pow
785 setLibcallName(RTLIB::POWI_F32, nullptr);
786 setLibcallName(RTLIB::POWI_F64, nullptr);
787 }
788
789 // Make floating-point constants legal for the large code model, so they don't
790 // become loads from the constant pool.
791 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
794 }
795
796 // AArch64 does not have floating-point extending loads, i1 sign-extending
797 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
798 for (MVT VT : MVT::fp_valuetypes()) {
803 }
804 for (MVT VT : MVT::integer_valuetypes())
806
814
818
819 // Indexed loads and stores are supported.
820 for (unsigned im = (unsigned)ISD::PRE_INC;
838 }
839
840 // Trap.
844
845 // We combine OR nodes for bitfield operations.
847 // Try to create BICs for vector ANDs.
849
850 // Vector add and sub nodes may conceal a high-half opportunity.
851 // Also, try to fold ADD into CSINC/CSINV..
859
863
865
873 if (Subtarget->supportsAddressTopByteIgnored())
875
878
880
883
889
891
892 // In case of strict alignment, avoid an excessive number of byte wide stores.
896
901
903
907
909
911
913
914 // Set required alignment.
916 // Set preferred alignments.
919
920 // Only change the limit for entries in a jump table if specified by
921 // the sub target, but not at the command line.
922 unsigned MaxJT = STI.getMaximumJumpTableSize();
925
927
929
930 if (Subtarget->hasNEON()) {
931 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
932 // silliness like this:
958
964
966
967 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
968 // elements smaller than i32, so promote the input to i32 first.
971 // i8 vector elements also need promotion to i32 for v8i8
974 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
979 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
980 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
983
984 if (Subtarget->hasFullFP16()) {
989 } else {
990 // when AArch64 doesn't have fullfp16 support, promote the input
991 // to i32 first.
996 }
997
1000
1001 // AArch64 doesn't have MUL.2d:
1003 // Custom handling for some quad-vector types to detect MULL.
1007
1008 // Saturates
1009 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1015 }
1016
1017 // Vector reductions
1018 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1020 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1023
1025 }
1026 }
1027 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1034 }
1036
1039 // Likewise, narrowing and extending vector loads/stores aren't handled
1040 // directly.
1043
1044 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1047 } else {
1050 }
1053
1056
1062 }
1063 }
1064
1065 // AArch64 has implementations of a lot of rounding-like FP operations.
1066 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1073 }
1074
1075 if (Subtarget->hasFullFP16()) {
1076 for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1083 }
1084 }
1085
1086 if (Subtarget->hasSVE())
1088
1090 }
1091
1092 if (Subtarget->hasSVE()) {
1093 // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
1094 // splat of 0 or undef) once vector selects supported in SVE codegen. See
1095 // D68877 for more details.
1096 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1130 }
1131
1132 // Illegal unpacked integer vector types.
1133 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1136 }
1137
1138 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1147
1148 // There are no legal MVT::nxv16f## based types.
1149 if (VT != MVT::nxv16i1) {
1152 }
1153 }
1154
1186 }
1187
1188 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1192 }
1193
1195
1198
1199 // NOTE: Currently this has to happen after computeRegisterProperties rather
1200 // than the preferred option of combining it with the addRegisterClass call.
1201 if (Subtarget->useSVEForFixedLengthVectors()) {
1203 if (useSVEForFixedLengthVectorVT(VT))
1204 addTypeForFixedLengthSVE(VT);
1206 if (useSVEForFixedLengthVectorVT(VT))
1207 addTypeForFixedLengthSVE(VT);
1208
1209 // 64bit results can mean a bigger than NEON input.
1210 for (auto VT : {MVT::v8i8, MVT::v4i16})
1213
1214 // 128bit results imply a bigger than NEON input.
1215 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1217 for (auto VT : {MVT::v8f16, MVT::v4f32})
1219
1220 // These operations are not supported on NEON but SVE can do them.
1255
1256 // Int operations with no NEON support.
1257 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1264 }
1265
1266 // FP operations with no NEON support.
1267 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1270
1271 // Use SVE for vectors with more than 2 elements.
1272 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1274 }
1275 }
1276
1278}
1279
1280void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
1281 assert(VT.isVector() && "VT should be a vector type");
1282
1283 if (VT.isFloatingPoint()) {
1287 }
1288
1289 // Mark vector float intrinsics as expand.
1290 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1299
1300 // But we do support custom-lowering for FCOPYSIGN.
1302 }
1303
1315
1319 for (MVT InnerVT : MVT::all_valuetypes())
1321
1322 // CNT supports only B element sizes, then use UADDLP to widen.
1323 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1325
1331
1334
1335 if (!VT.isFloatingPoint())
1337
1338 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1339 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1340 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1341 setOperationAction(Opcode, VT, Legal);
1342
1343 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1344 if (VT.isFloatingPoint() &&
1346 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1347 for (unsigned Opcode :
1349 setOperationAction(Opcode, VT, Legal);
1350
1351 if (Subtarget->isLittleEndian()) {
1352 for (unsigned im = (unsigned)ISD::PRE_INC;
1356 }
1357 }
1358}
1359
1360void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1361 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1362
1363 // By default everything must be expanded.
1364 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1365 setOperationAction(Op, VT, Expand);
1366
1367 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1369
1370 // Lower fixed length vector operations to scalable equivalents.
1429}
1430
1431void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1432 addRegisterClass(VT, &AArch64::FPR64RegClass);
1433 addTypeForNEON(VT, MVT::v2i32);
1434}
1435
1436void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1437 addRegisterClass(VT, &AArch64::FPR128RegClass);
1438 addTypeForNEON(VT, MVT::v4i32);
1439}
1440
1442 LLVMContext &C, EVT VT) const {
1443 if (!VT.isVector())
1444 return MVT::i32;
1445 if (VT.isScalableVector())
1448}
1449
1450static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1451 const APInt &Demanded,
1453 unsigned NewOpc) {
1454 uint64_t OldImm = Imm, NewImm, Enc;
1455 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1456
1457 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1458 // bimm64.
1459 if (Imm == 0 || Imm == Mask ||
1461 return false;
1462
1463 unsigned EltSize = Size;
1464 uint64_t DemandedBits = Demanded.getZExtValue();
1465
1466 // Clear bits that are not demanded.
1467 Imm &= DemandedBits;
1468
1469 while (true) {
1470 // The goal here is to set the non-demanded bits in a way that minimizes
1471 // the number of switching between 0 and 1. In order to achieve this goal,
1472 // we set the non-demanded bits to the value of the preceding demanded bits.
1473 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1474 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1475 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1476 // The final result is 0b11000011.
1477 uint64_t NonDemandedBits = ~DemandedBits;
1478 uint64_t InvertedImm = ~Imm & DemandedBits;
1479 uint64_t RotatedImm =
1480 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1482 uint64_t Sum = RotatedImm + NonDemandedBits;
1483 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1484 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1485 NewImm = (Imm | Ones) & Mask;
1486
1487 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1488 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1489 // we halve the element size and continue the search.
1490 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1491 break;
1492
1493 // We cannot shrink the element size any further if it is 2-bits.
1494 if (EltSize == 2)
1495 return false;
1496
1497 EltSize /= 2;
1498 Mask >>= EltSize;
1499 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1500
1501 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1502 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1503 return false;
1504
1505 // Merge the upper and lower halves of Imm and DemandedBits.
1506 Imm |= Hi;
1508 }
1509
1511
1512 // Replicate the element across the register width.
1513 while (EltSize < Size) {
1514 NewImm |= NewImm << EltSize;
1515 EltSize *= 2;
1516 }
1517
1518 (void)OldImm;
1519 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1520 "demanded bits should never be altered");
1521 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1522
1523 // Create the new constant immediate node.
1524 EVT VT = Op.getValueType();
1525 SDLoc DL(Op);
1526 SDValue New;
1527
1528 // If the new constant immediate is all-zeros or all-ones, let the target
1529 // independent DAG combine optimize this node.
1530 if (NewImm == 0 || NewImm == OrigMask) {
1531 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1532 TLO.DAG.getConstant(NewImm, DL, VT));
1533 // Otherwise, create a machine node so that target independent DAG combine
1534 // doesn't undo this optimization.
1535 } else {
1537 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1538 New = SDValue(
1539 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1540 }
1541
1542 return TLO.CombineTo(Op, New);
1543}
1544
1546 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1547 TargetLoweringOpt &TLO) const {
1548 // Delay this optimization to as late as possible.
1549 if (!TLO.LegalOps)
1550 return false;
1551
1553 return false;
1554
1555 EVT VT = Op.getValueType();
1556 if (VT.isVector())
1557 return false;
1558
1559 unsigned Size = VT.getSizeInBits();
1560 assert((Size == 32 || Size == 64) &&
1561 "i32 or i64 is expected after legalization.");
1562
1563 // Exit early if we demand all bits.
1564 if (DemandedBits.countPopulation() == Size)
1565 return false;
1566
1567 unsigned NewOpc;
1568 switch (Op.getOpcode()) {
1569 default:
1570 return false;
1571 case ISD::AND:
1572 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1573 break;
1574 case ISD::OR:
1575 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1576 break;
1577 case ISD::XOR:
1578 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1579 break;
1580 }
1581 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1582 if (!C)
1583 return false;
1584 uint64_t Imm = C->getZExtValue();
1585 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1586}
1587
1588/// computeKnownBitsForTargetNode - Determine which of the bits specified in
1589/// Mask are known to be either zero or one and return them Known.
1591 const SDValue Op, KnownBits &Known,
1592 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1593 switch (Op.getOpcode()) {
1594 default:
1595 break;
1596 case AArch64ISD::CSEL: {
1598 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1599 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1600 Known = KnownBits::commonBits(Known, Known2);
1601 break;
1602 }
1604 case AArch64ISD::ADDlow: {
1605 if (!Subtarget->isTargetILP32())
1606 break;
1607 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1608 Known.Zero = APInt::getHighBitsSet(64, 32);
1609 break;
1610 }
1612 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1613 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1614 switch (IntID) {
1615 default: return;
1616 case Intrinsic::aarch64_ldaxr:
1617 case Intrinsic::aarch64_ldxr: {
1618 unsigned BitWidth = Known.getBitWidth();
1619 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1620 unsigned MemBits = VT.getScalarSizeInBits();
1622 return;
1623 }
1624 }
1625 break;
1626 }
1628 case ISD::INTRINSIC_VOID: {
1629 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1630 switch (IntNo) {
1631 default:
1632 break;
1633 case Intrinsic::aarch64_neon_umaxv:
1634 case Intrinsic::aarch64_neon_uminv: {
1635 // Figure out the datatype of the vector operand. The UMINV instruction
1636 // will zero extend the result, so we can mark as known zero all the
1637 // bits larger than the element datatype. 32-bit or larget doesn't need
1638 // this as those are legal types and will be handled by isel directly.
1639 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1640 unsigned BitWidth = Known.getBitWidth();
1641 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1642 assert(BitWidth >= 8 && "Unexpected width!");
1644 Known.Zero |= Mask;
1645 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1646 assert(BitWidth >= 16 && "Unexpected width!");
1648 Known.Zero |= Mask;
1649 }
1650 break;
1651 } break;
1652 }
1653 }
1654 }
1655}
1656
1661
1663 EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1664 bool *Fast) const {
1665 if (Subtarget->requiresStrictAlign())
1666 return false;
1667
1668 if (Fast) {
1669 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1670 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1671 // See comments in performSTORECombine() for more details about
1672 // these conditions.
1673
1674 // Code that uses clang vector extensions can mark that it
1675 // wants unaligned accesses to be treated as fast by
1676 // underspecifying alignment to be 1 or 2.
1677 Align <= 2 ||
1678
1679 // Disregard v2i64. Memcpy lowering produces those and splitting
1680 // them regresses performance on micro-benchmarks and olden/bh.
1681 VT == MVT::v2i64;
1682 }
1683 return true;
1684}
1685
1686// Same as above but handling LLTs instead.
1688 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1689 bool *Fast) const {
1690 if (Subtarget->requiresStrictAlign())
1691 return false;
1692
1693 if (Fast) {
1694 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1695 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1696 Ty.getSizeInBytes() != 16 ||
1697 // See comments in performSTORECombine() for more details about
1698 // these conditions.
1699
1700 // Code that uses clang vector extensions can mark that it
1701 // wants unaligned accesses to be treated as fast by
1702 // underspecifying alignment to be 1 or 2.
1703 Alignment <= 2 ||
1704
1705 // Disregard v2i64. Memcpy lowering produces those and splitting
1706 // them regresses performance on micro-benchmarks and olden/bh.
1707 Ty == LLT::vector(2, 64);
1708 }
1709 return true;
1710}
1711
1712FastISel *
1717
1718const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1719#define MAKE_CASE(V) \
1720 case V: \
1721 return #V;
1722 switch ((AArch64ISD::NodeType)Opcode) {
1724 break;
1995 }
1996#undef MAKE_CASE
1997 return nullptr;
1998}
1999
2002 MachineBasicBlock *MBB) const {
2003 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2004 // phi node:
2005
2006 // OrigBB:
2007 // [... previous instrs leading to comparison ...]
2008 // b.ne TrueBB
2009 // b EndBB
2010 // TrueBB:
2011 // ; Fallthrough
2012 // EndBB:
2013 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2014
2015 MachineFunction *MF = MBB->getParent();
2016 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2017 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2018 DebugLoc DL = MI.getDebugLoc();
2020
2021 Register DestReg = MI.getOperand(0).getReg();
2022 Register IfTrueReg = MI.getOperand(1).getReg();
2023 Register IfFalseReg = MI.getOperand(2).getReg();
2024 unsigned CondCode = MI.getOperand(3).getImm();
2025 bool NZCVKilled = MI.getOperand(4).isKill();
2026
2029 MF->insert(It, TrueBB);
2030 MF->insert(It, EndBB);
2031
2032 // Transfer rest of current basic-block to EndBB
2033 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2034 MBB->end());
2036
2037 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2038 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2039 MBB->addSuccessor(TrueBB);
2040 MBB->addSuccessor(EndBB);
2041
2042 // TrueBB falls through to the end.
2043 TrueBB->addSuccessor(EndBB);
2044
2045 if (!NZCVKilled) {
2046 TrueBB->addLiveIn(AArch64::NZCV);
2047 EndBB->addLiveIn(AArch64::NZCV);
2048 }
2049
2050 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2052 .addMBB(TrueBB)
2054 .addMBB(MBB);
2055
2056 MI.eraseFromParent();
2057 return EndBB;
2058}
2059
2067
2069 MachineInstr &MI, MachineBasicBlock *BB) const {
2070 switch (MI.getOpcode()) {
2071 default:
2072#ifndef NDEBUG
2073 MI.dump();
2074#endif
2075 llvm_unreachable("Unexpected instruction for custom inserter!");
2076
2077 case AArch64::F128CSEL:
2078 return EmitF128CSEL(MI, BB);
2079
2080 case TargetOpcode::STACKMAP:
2081 case TargetOpcode::PATCHPOINT:
2082 case TargetOpcode::STATEPOINT:
2083 return emitPatchPoint(MI, BB);
2084
2085 case AArch64::CATCHRET:
2086 return EmitLoweredCatchRet(MI, BB);
2087 }
2088}
2089
2090//===----------------------------------------------------------------------===//
2091// AArch64 Lowering private implementation.
2092//===----------------------------------------------------------------------===//
2093
2094//===----------------------------------------------------------------------===//
2095// Lowering Code
2096//===----------------------------------------------------------------------===//
2097
2098/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2099/// CC
2101 switch (CC) {
2102 default:
2103 llvm_unreachable("Unknown condition code!");
2104 case ISD::SETNE:
2105 return AArch64CC::NE;
2106 case ISD::SETEQ:
2107 return AArch64CC::EQ;
2108 case ISD::SETGT:
2109 return AArch64CC::GT;
2110 case ISD::SETGE:
2111 return AArch64CC::GE;
2112 case ISD::SETLT:
2113 return AArch64CC::LT;
2114 case ISD::SETLE:
2115 return AArch64CC::LE;
2116 case ISD::SETUGT:
2117 return AArch64CC::HI;
2118 case ISD::SETUGE:
2119 return AArch64CC::HS;
2120 case ISD::SETULT:
2121 return AArch64CC::LO;
2122 case ISD::SETULE:
2123 return AArch64CC::LS;
2124 }
2125}
2126
2127/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2129 AArch64CC::CondCode &CondCode,
2132 switch (CC) {
2133 default:
2134 llvm_unreachable("Unknown FP condition!");
2135 case ISD::SETEQ:
2136 case ISD::SETOEQ:
2137 CondCode = AArch64CC::EQ;
2138 break;
2139 case ISD::SETGT:
2140 case ISD::SETOGT:
2141 CondCode = AArch64CC::GT;
2142 break;
2143 case ISD::SETGE:
2144 case ISD::SETOGE:
2145 CondCode = AArch64CC::GE;
2146 break;
2147 case ISD::SETOLT:
2148 CondCode = AArch64CC::MI;
2149 break;
2150 case ISD::SETOLE:
2151 CondCode = AArch64CC::LS;
2152 break;
2153 case ISD::SETONE:
2154 CondCode = AArch64CC::MI;
2156 break;
2157 case ISD::SETO:
2158 CondCode = AArch64CC::VC;
2159 break;
2160 case ISD::SETUO:
2161 CondCode = AArch64CC::VS;
2162 break;
2163 case ISD::SETUEQ:
2164 CondCode = AArch64CC::EQ;
2166 break;
2167 case ISD::SETUGT:
2168 CondCode = AArch64CC::HI;
2169 break;
2170 case ISD::SETUGE:
2171 CondCode = AArch64CC::PL;
2172 break;
2173 case ISD::SETLT:
2174 case ISD::SETULT:
2175 CondCode = AArch64CC::LT;
2176 break;
2177 case ISD::SETLE:
2178 case ISD::SETULE:
2179 CondCode = AArch64CC::LE;
2180 break;
2181 case ISD::SETNE:
2182 case ISD::SETUNE:
2183 CondCode = AArch64CC::NE;
2184 break;
2185 }
2186}
2187
2188/// Convert a DAG fp condition code to an AArch64 CC.
2189/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2190/// should be AND'ed instead of OR'ed.
2192 AArch64CC::CondCode &CondCode,
2195 switch (CC) {
2196 default:
2197 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2199 break;
2200 case ISD::SETONE:
2201 // (a one b)
2202 // == ((a olt b) || (a ogt b))
2203 // == ((a ord b) && (a une b))
2204 CondCode = AArch64CC::VC;
2206 break;
2207 case ISD::SETUEQ:
2208 // (a ueq b)
2209 // == ((a uno b) || (a oeq b))
2210 // == ((a ule b) && (a uge b))
2211 CondCode = AArch64CC::PL;
2213 break;
2214 }
2215}
2216
2217/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2218/// CC usable with the vector instructions. Fewer operations are available
2219/// without a real NZCV register, so we have to use less efficient combinations
2220/// to get the same effect.
2222 AArch64CC::CondCode &CondCode,
2224 bool &Invert) {
2225 Invert = false;
2226 switch (CC) {
2227 default:
2228 // Mostly the scalar mappings work fine.
2229 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2230 break;
2231 case ISD::SETUO:
2232 Invert = true;
2234 case ISD::SETO:
2235 CondCode = AArch64CC::MI;
2237 break;
2238 case ISD::SETUEQ:
2239 case ISD::SETULT:
2240 case ISD::SETULE:
2241 case ISD::SETUGT:
2242 case ISD::SETUGE:
2243 // All of the compare-mask comparisons are ordered, but we can switch
2244 // between the two by a double inversion. E.g. ULE == !OGT.
2245 Invert = true;
2246 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2247 CondCode, CondCode2);
2248 break;
2249 }
2250}
2251
2252static bool isLegalArithImmed(uint64_t C) {
2253 // Matches AArch64DAGToDAGISel::SelectArithImmed().
2254 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2255 LLVM_DEBUG(dbgs() << "Is imm " << C
2256 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2257 return IsLegal;
2258}
2259
2260// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
2261// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2262// can be set differently by this operation. It comes down to whether
2263// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2264// everything is fine. If not then the optimization is wrong. Thus general
2265// comparisons are only valid if op2 != 0.
2266//
2267// So, finally, the only LLVM-native comparisons that don't mention C and V
2268// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2269// the absence of information about op2.
2270static bool isCMN(SDValue Op, ISD::CondCode CC) {
2271 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2272 (CC == ISD::SETEQ || CC == ISD::SETNE);
2273}
2274
2276 SelectionDAG &DAG, SDValue Chain,
2277 bool IsSignaling) {
2278 EVT VT = LHS.getValueType();
2279 assert(VT != MVT::f128);
2280 assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2281 unsigned Opcode =
2283 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2284}
2285
2287 const SDLoc &dl, SelectionDAG &DAG) {
2288 EVT VT = LHS.getValueType();
2289 const bool FullFP16 =
2290 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2291
2292 if (VT.isFloatingPoint()) {
2293 assert(VT != MVT::f128);
2294 if (VT == MVT::f16 && !FullFP16) {
2295 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2296 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2297 VT = MVT::f32;
2298 }
2299 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2300 }
2301
2302 // The CMP instruction is just an alias for SUBS, and representing it as
2303 // SUBS means that it's possible to get CSE with subtract operations.
2304 // A later phase can perform the optimization of setting the destination
2305 // register to WZR/XZR if it ends up being unused.
2306 unsigned Opcode = AArch64ISD::SUBS;
2307
2308 if (isCMN(RHS, CC)) {
2309 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2310 Opcode = AArch64ISD::ADDS;
2311 RHS = RHS.getOperand(1);
2312 } else if (isCMN(LHS, CC)) {
2313 // As we are looking for EQ/NE compares, the operands can be commuted ; can
2314 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2315 Opcode = AArch64ISD::ADDS;
2316 LHS = LHS.getOperand(1);
2317 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2318 if (LHS.getOpcode() == ISD::AND) {
2319 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2320 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2321 // of the signed comparisons.
2322 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2323 DAG.getVTList(VT, MVT_CC),
2324 LHS.getOperand(0),
2325 LHS.getOperand(1));
2326 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2327 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2328 return ANDSNode.getValue(1);
2329 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2330 // Use result of ANDS
2331 return LHS.getValue(1);
2332 }
2333 }
2334
2335 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2336 .getValue(1);
2337}
2338
2339/// \defgroup AArch64CCMP CMP;CCMP matching
2340///
2341/// These functions deal with the formation of CMP;CCMP;... sequences.
2342/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2343/// a comparison. They set the NZCV flags to a predefined value if their
2344/// predicate is false. This allows to express arbitrary conjunctions, for
2345/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2346/// expressed as:
2347/// cmp A
2348/// ccmp B, inv(CB), CA
2349/// check for CB flags
2350///
2351/// This naturally lets us implement chains of AND operations with SETCC
2352/// operands. And we can even implement some other situations by transforming
2353/// them:
2354/// - We can implement (NEG SETCC) i.e. negating a single comparison by
2355/// negating the flags used in a CCMP/FCCMP operations.
2356/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2357/// by negating the flags we test for afterwards. i.e.
2358/// NEG (CMP CCMP CCCMP ...) can be implemented.
2359/// - Note that we can only ever negate all previously processed results.
2360/// What we can not implement by flipping the flags to test is a negation
2361/// of two sub-trees (because the negation affects all sub-trees emitted so
2362/// far, so the 2nd sub-tree we emit would also affect the first).
2363/// With those tools we can implement some OR operations:
2364/// - (OR (SETCC A) (SETCC B)) can be implemented via:
2365/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2366/// - After transforming OR to NEG/AND combinations we may be able to use NEG
2367/// elimination rules from earlier to implement the whole thing as a
2368/// CCMP/FCCMP chain.
2369///
2370/// As complete example:
2371/// or (or (setCA (cmp A)) (setCB (cmp B)))
2372/// (and (setCC (cmp C)) (setCD (cmp D)))"
2373/// can be reassociated to:
2374/// or (and (setCC (cmp C)) setCD (cmp D))
2375/// (or (setCA (cmp A)) (setCB (cmp B)))
2376/// can be transformed to:
2377/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2378/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2379/// which can be implemented as:
2380/// cmp C
2381/// ccmp D, inv(CD), CC
2382/// ccmp A, CA, inv(CD)
2383/// ccmp B, CB, inv(CA)
2384/// check for CB flags
2385///
2386/// A counterexample is "or (and A B) (and C D)" which translates to
2387/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2388/// can only implement 1 of the inner (not) operations, but not both!
2389/// @{
2390
2391/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2393 ISD::CondCode CC, SDValue CCOp,
2394 AArch64CC::CondCode Predicate,
2396 const SDLoc &DL, SelectionDAG &DAG) {
2397 unsigned Opcode = 0;
2398 const bool FullFP16 =
2399 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2400
2401 if (LHS.getValueType().isFloatingPoint()) {
2402 assert(LHS.getValueType() != MVT::f128);
2403 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2404 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2405 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2406 }
2407 Opcode = AArch64ISD::FCCMP;
2408 } else if (RHS.getOpcode() == ISD::SUB) {
2409 SDValue SubOp0 = RHS.getOperand(0);
2410 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2411 // See emitComparison() on why we can only do this for SETEQ and SETNE.
2412 Opcode = AArch64ISD::CCMN;
2413 RHS = RHS.getOperand(1);
2414 }
2415 }
2416 if (Opcode == 0)
2417 Opcode = AArch64ISD::CCMP;
2418
2419 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2423 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2424}
2425
2426/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2427/// expressed as a conjunction. See \ref AArch64CCMP.
2428/// \param CanNegate Set to true if we can negate the whole sub-tree just by
2429/// changing the conditions on the SETCC tests.
2430/// (this means we can call emitConjunctionRec() with
2431/// Negate==true on this sub-tree)
2432/// \param MustBeFirst Set to true if this subtree needs to be negated and we
2433/// cannot do the negation naturally. We are required to
2434/// emit the subtree first in this case.
2435/// \param WillNegate Is true if are called when the result of this
2436/// subexpression must be negated. This happens when the
2437/// outer expression is an OR. We can use this fact to know
2438/// that we have a double negation (or (or ...) ...) that
2439/// can be implemented for free.
2440static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2441 bool &MustBeFirst, bool WillNegate,
2442 unsigned Depth = 0) {
2443 if (!Val.hasOneUse())
2444 return false;
2445 unsigned Opcode = Val->getOpcode();
2446 if (Opcode == ISD::SETCC) {
2447 if (Val->getOperand(0).getValueType() == MVT::f128)
2448 return false;
2449 CanNegate = true;
2450 MustBeFirst = false;
2451 return true;
2452 }
2453 // Protect against exponential runtime and stack overflow.
2454 if (Depth > 6)
2455 return false;
2456 if (Opcode == ISD::AND || Opcode == ISD::OR) {
2457 bool IsOR = Opcode == ISD::OR;
2458 SDValue O0 = Val->getOperand(0);
2459 SDValue O1 = Val->getOperand(1);
2460 bool CanNegateL;
2461 bool MustBeFirstL;
2463 return false;
2464 bool CanNegateR;
2465 bool MustBeFirstR;
2467 return false;
2468
2470 return false;
2471
2472 if (IsOR) {
2473 // For an OR expression we need to be able to naturally negate at least
2474 // one side or we cannot do the transformation at all.
2475 if (!CanNegateL && !CanNegateR)
2476 return false;
2477 // If we the result of the OR will be negated and we can naturally negate
2478 // the leafs, then this sub-tree as a whole negates naturally.
2480 // If we cannot naturally negate the whole sub-tree, then this must be
2481 // emitted first.
2483 } else {
2484 assert(Opcode == ISD::AND && "Must be OR or AND");
2485 // We cannot naturally negate an AND operation.
2486 CanNegate = false;
2488 }
2489 return true;
2490 }
2491 return false;
2492}
2493
2494/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2495/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2496/// Tries to transform the given i1 producing node @p Val to a series compare
2497/// and conditional compare operations. @returns an NZCV flags producing node
2498/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
2499/// transformation was not possible.
2500/// \p Negate is true if we want this sub-tree being negated just by changing
2501/// SETCC conditions.
2503 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2504 AArch64CC::CondCode Predicate) {
2505 // We're at a tree leaf, produce a conditional comparison operation.
2506 unsigned Opcode = Val->getOpcode();
2507 if (Opcode == ISD::SETCC) {
2508 SDValue LHS = Val->getOperand(0);
2509 SDValue RHS = Val->getOperand(1);
2510 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2511 bool isInteger = LHS.getValueType().isInteger();
2512 if (Negate)
2513 CC = getSetCCInverse(CC, LHS.getValueType());
2514 SDLoc DL(Val);
2515 // Determine OutCC and handle FP special case.
2516 if (isInteger) {
2518 } else {
2519 assert(LHS.getValueType().isFloatingPoint());
2522 // Some floating point conditions can't be tested with a single condition
2523 // code. Construct an additional comparison in this case.
2524 if (ExtraCC != AArch64CC::AL) {
2526 if (!CCOp.getNode())
2527 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2528 else
2529 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2530 ExtraCC, DL, DAG);
2531 CCOp = ExtraCmp;
2532 Predicate = ExtraCC;
2533 }
2534 }
2535
2536 // Produce a normal comparison if we are first in the chain
2537 if (!CCOp)
2538 return emitComparison(LHS, RHS, CC, DL, DAG);
2539 // Otherwise produce a ccmp.
2540 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2541 DAG);
2542 }
2543 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2544
2545 bool IsOR = Opcode == ISD::OR;
2546
2547 SDValue LHS = Val->getOperand(0);
2548 bool CanNegateL;
2549 bool MustBeFirstL;
2551 assert(ValidL && "Valid conjunction/disjunction tree");
2552 (void)ValidL;
2553
2554 SDValue RHS = Val->getOperand(1);
2555 bool CanNegateR;
2556 bool MustBeFirstR;
2558 assert(ValidR && "Valid conjunction/disjunction tree");
2559 (void)ValidR;
2560
2561 // Swap sub-tree that must come first to the right side.
2562 if (MustBeFirstL) {
2563 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2564 std::swap(LHS, RHS);
2567 }
2568
2569 bool NegateR;
2570 bool NegateAfterR;
2571 bool NegateL;
2572 bool NegateAfterAll;
2573 if (Opcode == ISD::OR) {
2574 // Swap the sub-tree that we can negate naturally to the left.
2575 if (!CanNegateL) {
2576 assert(CanNegateR && "at least one side must be negatable");
2577 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2578 assert(!Negate);
2579 std::swap(LHS, RHS);
2580 NegateR = false;
2581 NegateAfterR = true;
2582 } else {
2583 // Negate the left sub-tree if possible, otherwise negate the result.
2586 }
2587 NegateL = true;
2588 NegateAfterAll = !Negate;
2589 } else {
2590 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2591 assert(!Negate && "Valid conjunction/disjunction tree");
2592
2593 NegateL = false;
2594 NegateR = false;
2595 NegateAfterR = false;
2596 NegateAfterAll = false;
2597 }
2598
2599 // Emit sub-trees.
2601 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2602 if (NegateAfterR)
2605 if (NegateAfterAll)
2607 return CmpL;
2608}
2609
2610/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2611/// In some cases this is even possible with OR operations in the expression.
2612/// See \ref AArch64CCMP.
2613/// \see emitConjunctionRec().
2616 bool DummyCanNegate;
2617 bool DummyMustBeFirst;
2619 return SDValue();
2620
2621 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2622}
2623
2624/// @}
2625
2626/// Returns how profitable it is to fold a comparison's operand's shift and/or
2627/// extension operations.
2629 auto isSupportedExtend = [&](SDValue V) {
2630 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2631 return true;
2632
2633 if (V.getOpcode() == ISD::AND)
2634 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2635 uint64_t Mask = MaskCst->getZExtValue();
2636 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2637 }
2638
2639 return false;
2640 };
2641
2642 if (!Op.hasOneUse())
2643 return 0;
2644
2645 if (isSupportedExtend(Op))
2646 return 1;
2647
2648 unsigned Opc = Op.getOpcode();
2649 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2650 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2651 uint64_t Shift = ShiftCst->getZExtValue();
2652 if (isSupportedExtend(Op.getOperand(0)))
2653 return (Shift <= 4) ? 2 : 1;
2654 EVT VT = Op.getValueType();
2655 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2656 return 1;
2657 }
2658
2659 return 0;
2660}
2661
2664 const SDLoc &dl) {
2665 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2666 EVT VT = RHS.getValueType();
2667 uint64_t C = RHSC->getZExtValue();
2668 if (!isLegalArithImmed(C)) {
2669 // Constant does not fit, try adjusting it by one?
2670 switch (CC) {
2671 default:
2672 break;
2673 case ISD::SETLT:
2674 case ISD::SETGE:
2675 if ((VT == MVT::i32 && C != 0x80000000 &&
2676 isLegalArithImmed((uint32_t)(C - 1))) ||
2677 (VT == MVT::i64 && C != 0x80000000ULL &&
2678 isLegalArithImmed(C - 1ULL))) {
2679 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2680 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2681 RHS = DAG.getConstant(C, dl, VT);
2682 }
2683 break;
2684 case ISD::SETULT:
2685 case ISD::SETUGE:
2686 if ((VT == MVT::i32 && C != 0 &&
2687 isLegalArithImmed((uint32_t)(C - 1))) ||
2688 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2689 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2690 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2691 RHS = DAG.getConstant(C, dl, VT);
2692 }
2693 break;
2694 case ISD::SETLE:
2695 case ISD::SETGT:
2696 if ((VT == MVT::i32 && C != INT32_MAX &&
2697 isLegalArithImmed((uint32_t)(C + 1))) ||
2698 (VT == MVT::i64 && C != INT64_MAX &&
2699 isLegalArithImmed(C + 1ULL))) {
2700 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2701 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2702 RHS = DAG.getConstant(C, dl, VT);
2703 }
2704 break;
2705 case ISD::SETULE:
2706 case ISD::SETUGT:
2707 if ((VT == MVT::i32 && C != UINT32_MAX &&
2708 isLegalArithImmed((uint32_t)(C + 1))) ||
2709 (VT == MVT::i64 && C != UINT64_MAX &&
2710 isLegalArithImmed(C + 1ULL))) {
2711 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2712 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2713 RHS = DAG.getConstant(C, dl, VT);
2714 }
2715 break;
2716 }
2717 }
2718 }
2719
2720 // Comparisons are canonicalized so that the RHS operand is simpler than the
2721 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2722 // can fold some shift+extend operations on the RHS operand, so swap the
2723 // operands if that can be done.
2724 //
2725 // For example:
2726 // lsl w13, w11, #1
2727 // cmp w13, w12
2728 // can be turned into:
2729 // cmp w12, w11, lsl #1
2730 if (!isa<ConstantSDNode>(RHS) ||
2731 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2732 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2733
2735 std::swap(LHS, RHS);
2737 }
2738 }
2739
2740 SDValue Cmp;
2741 AArch64CC::CondCode AArch64CC;
2742 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2744
2745 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2746 // For the i8 operand, the largest immediate is 255, so this can be easily
2747 // encoded in the compare instruction. For the i16 operand, however, the
2748 // largest immediate cannot be encoded in the compare.
2749 // Therefore, use a sign extending load and cmn to avoid materializing the
2750 // -1 constant. For example,
2751 // movz w1, #65535
2752 // ldrh w0, [x0, #0]
2753 // cmp w0, w1
2754 // >
2755 // ldrsh w0, [x0, #0]
2756 // cmn w0, #1
2757 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
2758 // if and only if (sext LHS) == (sext RHS). The checks are in place to
2759 // ensure both the LHS and RHS are truly zero extended and to make sure the
2760 // transformation is profitable.
2761 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2762 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2763 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2764 LHS.getNode()->hasNUsesOfValue(1, 0)) {
2765 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2767 SDValue SExt =
2768 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2769 DAG.getValueType(MVT::i16));
2770 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2771 RHS.getValueType()),
2772 CC, dl, DAG);
2773 AArch64CC = changeIntCCToAArch64CC(CC);
2774 }
2775 }
2776
2777 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2778 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2779 if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2780 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2781 }
2782 }
2783 }
2784
2785 if (!Cmp) {
2786 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2787 AArch64CC = changeIntCCToAArch64CC(CC);
2788 }
2789 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2790 return Cmp;
2791}
2792
2793static std::pair<SDValue, SDValue>
2795 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2796 "Unsupported value type");
2798 SDLoc DL(Op);
2799 SDValue LHS = Op.getOperand(0);
2800 SDValue RHS = Op.getOperand(1);
2801 unsigned Opc = 0;
2802 switch (Op.getOpcode()) {
2803 default:
2804 llvm_unreachable("Unknown overflow instruction!");
2805 case ISD::SADDO:
2806 Opc = AArch64ISD::ADDS;
2807 CC = AArch64CC::VS;
2808 break;
2809 case ISD::UADDO:
2810 Opc = AArch64ISD::ADDS;
2811 CC = AArch64CC::HS;
2812 break;
2813 case ISD::SSUBO:
2814 Opc = AArch64ISD::SUBS;
2815 CC = AArch64CC::VS;
2816 break;
2817 case ISD::USUBO:
2818 Opc = AArch64ISD::SUBS;
2819 CC = AArch64CC::LO;
2820 break;
2821 // Multiply needs a little bit extra work.
2822 case ISD::SMULO:
2823 case ISD::UMULO: {
2824 CC = AArch64CC::NE;
2825 bool IsSigned = Op.getOpcode() == ISD::SMULO;
2826 if (Op.getValueType() == MVT::i32) {
2827 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2828 // For a 32 bit multiply with overflow check we want the instruction
2829 // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2830 // need to generate the following pattern:
2831 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2832 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2833 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2834 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2836 DAG.getConstant(0, DL, MVT::i64));
2837 // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2838 // operation. We need to clear out the upper 32 bits, because we used a
2839 // widening multiply that wrote all 64 bits. In the end this should be a
2840 // noop.
2842 if (IsSigned) {
2843 // The signed overflow check requires more than just a simple check for
2844 // any bit set in the upper 32 bits of the result. These bits could be
2845 // just the sign bits of a negative number. To perform the overflow
2846 // check we have to arithmetic shift right the 32nd bit of the result by
2847 // 31 bits. Then we compare the result to the upper 32 bits.
2849 DAG.getConstant(32, DL, MVT::i64));
2852 DAG.getConstant(31, DL, MVT::i64));
2853 // It is important that LowerBits is last, otherwise the arithmetic
2854 // shift will not be folded into the compare (SUBS).
2855 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2857 .getValue(1);
2858 } else {
2859 // The overflow check for unsigned multiply is easy. We only need to
2860 // check if any of the upper 32 bits are set. This can be done with a
2861 // CMP (shifted register). For that we need to generate the following
2862 // pattern:
2863 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
2865 DAG.getConstant(32, DL, MVT::i64));
2866 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2867 Overflow =
2868 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2869 DAG.getConstant(0, DL, MVT::i64),
2870 UpperBits).getValue(1);
2871 }
2872 break;
2873 }
2874 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2875 // For the 64 bit multiply
2876 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2877 if (IsSigned) {
2878 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2880 DAG.getConstant(63, DL, MVT::i64));
2881 // It is important that LowerBits is last, otherwise the arithmetic
2882 // shift will not be folded into the compare (SUBS).
2883 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2885 .getValue(1);
2886 } else {
2887 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2888 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2889 Overflow =
2890 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2891 DAG.getConstant(0, DL, MVT::i64),
2892 UpperBits).getValue(1);
2893 }
2894 break;
2895 }
2896 } // switch (...)
2897
2898 if (Opc) {
2899 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2900
2901 // Emit the AArch64 operation with overflow check.
2902 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2903 Overflow = Value.getValue(1);
2904 }
2905 return std::make_pair(Value, Overflow);
2906}
2907
2908SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
2909 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
2910 return LowerToScalableOp(Op, DAG);
2911
2912 SDValue Sel = Op.getOperand(0);
2913 SDValue Other = Op.getOperand(1);
2914 SDLoc dl(Sel);
2915
2916 // If the operand is an overflow checking operation, invert the condition
2917 // code and kill the Not operation. I.e., transform:
2918 // (xor (overflow_op_bool, 1))
2919 // -->
2920 // (csel 1, 0, invert(cc), overflow_op_bool)
2921 // ... which later gets transformed to just a cset instruction with an
2922 // inverted condition code, rather than a cset + eor sequence.
2924 // Only lower legal XALUO ops.
2925 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2926 return SDValue();
2927
2928 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2929 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2932 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2933 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2934 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2935 CCVal, Overflow);
2936 }
2937 // If neither operand is a SELECT_CC, give up.
2938 if (Sel.getOpcode() != ISD::SELECT_CC)
2940 if (Sel.getOpcode() != ISD::SELECT_CC)
2941 return Op;
2942
2943 // The folding we want to perform is:
2944 // (xor x, (select_cc a, b, cc, 0, -1) )
2945 // -->
2946 // (csel x, (xor x, -1), cc ...)
2947 //
2948 // The latter will get matched to a CSINV instruction.
2949
2950 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2951 SDValue LHS = Sel.getOperand(0);
2952 SDValue RHS = Sel.getOperand(1);
2953 SDValue TVal = Sel.getOperand(2);
2954 SDValue FVal = Sel.getOperand(3);
2955
2956 // FIXME: This could be generalized to non-integer comparisons.
2957 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2958 return Op;
2959
2962
2963 // The values aren't constants, this isn't the pattern we're looking for.
2964 if (!CFVal || !CTVal)
2965 return Op;
2966
2967 // We can commute the SELECT_CC by inverting the condition. This
2968 // might be needed to make this fit into a CSINV pattern.
2969 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2972 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
2973 }
2974
2975 // If the constants line up, perform the transform!
2976 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2977 SDValue CCVal;
2978 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2979
2980 FVal = Other;
2981 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2982 DAG.getConstant(-1ULL, dl, Other.getValueType()));
2983
2984 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2985 CCVal, Cmp);
2986 }
2987
2988 return Op;
2989}
2990
2992 EVT VT = Op.getValueType();
2993
2994 // Let legalize expand this if it isn't a legal type yet.
2995 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2996 return SDValue();
2997
2998 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2999
3000 unsigned Opc;
3001 bool ExtraOp = false;
3002 switch (Op.getOpcode()) {
3003 default:
3004 llvm_unreachable("Invalid code");
3005 case ISD::ADDC:
3006 Opc = AArch64ISD::ADDS;
3007 break;
3008 case ISD::SUBC:
3009 Opc = AArch64ISD::SUBS;
3010 break;
3011 case ISD::ADDE:
3012 Opc = AArch64ISD::ADCS;
3013 ExtraOp = true;
3014 break;
3015 case ISD::SUBE:
3016 Opc = AArch64ISD::SBCS;
3017 ExtraOp = true;
3018 break;
3019 }
3020
3021 if (!ExtraOp)
3022 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3023 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3024 Op.getOperand(2));
3025}
3026
3028 // Let legalize expand this if it isn't a legal type yet.
3029 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3030 return SDValue();
3031
3032 SDLoc dl(Op);
3034 // The actual operation that sets the overflow or carry flag.
3036 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3037
3038 // We use 0 and 1 as false and true values.
3039 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3040 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3041
3042 // We use an inverted condition, because the conditional select is inverted
3043 // too. This will allow it to be selected to a single instruction:
3044 // CSINC Wd, WZR, WZR, invert(cond).
3045 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3047 CCVal, Overflow);
3048
3049 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3050 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3051}
3052
3053// Prefetch operands are:
3054// 1: Address to prefetch
3055// 2: bool isWrite
3056// 3: int locality (0 = no locality ... 3 = extreme locality)
3057// 4: bool isDataCache
3059 SDLoc DL(Op);
3060 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3061 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3062 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3063
3064 bool IsStream = !Locality;
3065 // When the locality number is set
3066 if (Locality) {
3067 // The front-end should have filtered out the out-of-range values
3068 assert(Locality <= 3 && "Prefetch locality out-of-range");
3069 // The locality degree is the opposite of the cache speed.
3070 // Put the number the other way around.
3071 // The encoding starts at 0 for level 1
3072 Locality = 3 - Locality;
3073 }
3074
3075 // built the mask value encoding the expected behavior.
3076 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3077 (!IsData << 3) | // IsDataCache bit
3078 (Locality << 1) | // Cache level bits
3079 (unsigned)IsStream; // Stream bit
3080 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3081 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3082}
3083
3084SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3085 SelectionDAG &DAG) const {
3086 if (Op.getValueType().isScalableVector())
3087 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3088
3089 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3090 return SDValue();
3091}
3092
3093SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3094 SelectionDAG &DAG) const {
3095 if (Op.getValueType().isScalableVector())
3096 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3097
3098 bool IsStrict = Op->isStrictFPOpcode();
3099 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3100 EVT SrcVT = SrcVal.getValueType();
3101
3102 if (SrcVT != MVT::f128) {
3103 // Expand cases where the input is a vector bigger than NEON.
3104 if (useSVEForFixedLengthVectorVT(SrcVT))
3105 return SDValue();
3106
3107 // It's legal except when f128 is involved
3108 return Op;
3109 }
3110
3111 return SDValue();
3112}
3113
// Custom lowering for vector FP -> integer conversions (FP_TO_SINT /
// FP_TO_UINT). Scalable vectors become predicated AArch64 nodes; fixed
// vectors are legalized by promoting f16 sources, truncating oversized
// results, or extending undersized sources so element widths match.
// NOTE(review): some operand lines are elided in this rendered listing.
3114SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3115 SelectionDAG &DAG) const {
3116 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3117 // Any additional optimization in this function should be recorded
3118 // in the cost tables.
3119 EVT InVT = Op.getOperand(0).getValueType();
3120 EVT VT = Op.getValueType();
3121
// Scalable vectors: pick the signed/unsigned predicated conversion node.
3122 if (VT.isScalableVector()) {
3123 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3126 return LowerToPredicatedOp(Op, DAG, Opcode);
3127 }
3128
3129 unsigned NumElts = InVT.getVectorNumElements();
3130
3131 // f16 conversions are promoted to f32 when full fp16 is not supported.
3132 if (InVT.getVectorElementType() == MVT::f16 &&
3133 !Subtarget->hasFullFP16()) {
3135 SDLoc dl(Op);
3136 return DAG.getNode(
3137 Op.getOpcode(), dl, Op.getValueType(),
3138 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3139 }
3140
// Result narrower than source: convert at source width, then truncate.
3141 uint64_t VTSize = VT.getFixedSizeInBits();
3142 uint64_t InVTSize = InVT.getFixedSizeInBits();
3143 if (VTSize < InVTSize) {
3144 SDLoc dl(Op);
3145 SDValue Cv =
3146 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3147 Op.getOperand(0));
3148 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3149 }
3150
// Result wider than source: extend the FP source first, then convert.
3151 if (VTSize > InVTSize) {
3152 SDLoc dl(Op);
3153 MVT ExtVT =
3156 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3157 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3158 }
3159
3160 // Type changing conversions are illegal.
3161 return Op;
3162}
3163
// Custom lowering for scalar FP -> integer conversions; vector operands are
// forwarded to LowerVectorFP_TO_INT. f16 without full fp16 support is
// promoted through f32; f128 falls back to the generic libcall expansion.
3164SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3165 SelectionDAG &DAG) const {
// Strict FP nodes carry the chain in operand 0; the value follows it.
3166 bool IsStrict = Op->isStrictFPOpcode();
3167 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3168
3169 if (SrcVal.getValueType().isVector())
3170 return LowerVectorFP_TO_INT(Op, DAG);
3171
3172 // f16 conversions are promoted to f32 when full fp16 is not supported.
3173 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3174 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3175 SDLoc dl(Op);
3176 return DAG.getNode(
3177 Op.getOpcode(), dl, Op.getValueType(),
3179 }
3180
3181 if (SrcVal.getValueType() != MVT::f128) {
3182 // It's legal except when f128 is involved
3183 return Op;
3184 }
3185
// f128: expand via libcall.
3186 return SDValue();
3187}
3188
// Custom lowering for vector integer -> FP conversions (SINT_TO_FP /
// UINT_TO_FP and strict variants). Scalable vectors go to predicated nodes
// (predicates are widened first); fixed vectors are adjusted so source and
// result element widths match before converting.
// NOTE(review): some operand lines are elided in this rendered listing.
3189SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3190 SelectionDAG &DAG) const {
3191 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3192 // Any additional optimization in this function should be recorded
3193 // in the cost tables.
3194 EVT VT = Op.getValueType();
3195 SDLoc dl(Op);
3196 SDValue In = Op.getOperand(0);
3197 EVT InVT = In.getValueType();
3198 unsigned Opc = Op.getOpcode();
3199 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3200
3201 if (VT.isScalableVector()) {
3202 if (InVT.getVectorElementType() == MVT::i1) {
3203 // We can't directly extend an SVE predicate; extend it first.
3204 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3206 In = DAG.getNode(CastOpc, dl, CastVT, In);
3207 return DAG.getNode(Opc, dl, VT, In);
3208 }
3209
3210 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3212 return LowerToPredicatedOp(Op, DAG, Opcode);
3213 }
3214
// Result narrower than source: convert at source width, then FP_ROUND down.
3215 uint64_t VTSize = VT.getFixedSizeInBits();
3216 uint64_t InVTSize = InVT.getFixedSizeInBits();
3217 if (VTSize < InVTSize) {
3218 MVT CastVT =
3219 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3220 InVT.getVectorNumElements());
3221 In = DAG.getNode(Opc, dl, CastVT, In);
3222 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3223 }
3224
// Result wider than source: sign/zero-extend the integer input first.
3225 if (VTSize > InVTSize) {
3226 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3228 In = DAG.getNode(CastOpc, dl, CastVT, In);
3229 return DAG.getNode(Opc, dl, VT, In);
3230 }
3231
// Matching widths: the operation is already legal as-is.
3232 return Op;
3233}
3234
// Custom lowering for scalar integer -> FP conversions; vectors go to
// LowerVectorINT_TO_FP. f16 results without full fp16 are computed in f32
// and rounded; i128 sources and f128 results fall back to libcalls.
3235SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3236 SelectionDAG &DAG) const {
3237 if (Op.getValueType().isVector())
3238 return LowerVectorINT_TO_FP(Op, DAG);
3239
// Strict FP nodes carry the chain in operand 0; the value follows it.
3240 bool IsStrict = Op->isStrictFPOpcode();
3241 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3242
3243 // f16 conversions are promoted to f32 when full fp16 is not supported.
3244 if (Op.getValueType() == MVT::f16 &&
3245 !Subtarget->hasFullFP16()) {
3246 assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3247 SDLoc dl(Op);
3248 return DAG.getNode(
3250 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3251 DAG.getIntPtrConstant(0, dl));
3252 }
3253
3254 // i128 conversions are libcalls.
3255 if (SrcVal.getValueType() == MVT::i128)
3256 return SDValue();
3257
3258 // Other conversions are legal, unless it's to the completely software-based
3259 // fp128.
3260 if (Op.getValueType() != MVT::f128)
3261 return Op;
3262 return SDValue();
3263}
3264
// Lower ISD::FSINCOS by emitting a call to the platform's combined
// sin/cos entry point (__sincos_stret on iOS), which returns both results
// in a pair of FP registers, avoiding two separate libcalls.
3265SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3266 SelectionDAG &DAG) const {
3267 // For iOS, we want to call an alternative entry point: __sincos_stret,
3268 // which returns the values in two S / D registers.
3269 SDLoc dl(Op);
3270 SDValue Arg = Op.getOperand(0);
3271 EVT ArgVT = Arg.getValueType();
3272 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3273
// Build the single-argument list for the libcall.
3274 ArgListTy Args;
3275 ArgListEntry Entry;
3276
3277 Entry.Node = Arg;
3278 Entry.Ty = ArgTy;
3279 Entry.IsSExt = false;
3280 Entry.IsZExt = false;
3281 Args.push_back(Entry);
3282
// Pick the f32 or f64 variant based on the operand type.
3283 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3284 : RTLIB::SINCOS_STRET_F32;
3285 const char *LibcallName = getLibcallName(LC);
3286 SDValue Callee =
3288
3291 CLI.setDebugLoc(dl)
3292 .setChain(DAG.getEntryNode())
3293 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3294
3295 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3296 return CallResult.first;
3297}
3298
// Body of the bitcast lowering helper (the signature line is elided in this
// listing — presumably static SDValue LowerBITCAST(SDValue Op,
// SelectionDAG &DAG); TODO confirm). Handles i16 -> f16/bf16 bitcasts by
// widening to i32, bitcasting to f32, and extracting the low half (hsub).
3300 EVT OpVT = Op.getValueType();
3301 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3302 return SDValue();
3303
3304 assert(Op.getOperand(0).getValueType() == MVT::i16);
3305 SDLoc DL(Op);
3306
// There is no direct i16 -> f16 bitcast instruction; go through f32.
3307 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3308 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
// Extract the h-register subreg of the f32 value to get the 16-bit result.
3309 return SDValue(
3310 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3311 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3312 0);
3313}
3314
// Body of getExtensionTo64Bits (name from the call site in
// addRequiredExtensionForVectorMULL; the signature line is elided in this
// listing). Maps a sub-64-bit vector type to the 64-bit vector type with
// the same element count; types already >= 64 bits are returned unchanged.
3316 if (OrigVT.getSizeInBits() >= 64)
3317 return OrigVT;
3318
3319 assert(OrigVT.isSimple() && "Expecting a simple value type");
3320
3321 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3322 switch (OrigSimpleTy) {
3323 default: llvm_unreachable("Unexpected Vector Type");
3324 case MVT::v2i8:
3325 case MVT::v2i16:
3326 return MVT::v2i32;
3327 case MVT::v4i8:
3328 return MVT::v4i16;
3329 }
3330}
3331
// addRequiredExtensionForVectorMULL (first signature line elided in this
// listing). Re-extends a value whose pre-extension type was narrower than
// 64 bits, so it becomes a valid 64-bit operand for S/UMULL formation.
3333 const EVT &OrigTy,
3334 const EVT &ExtTy,
3335 unsigned ExtOpcode) {
3336 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
3337 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
3338 // 64-bits we need to insert a new extension so that it will be 64-bits.
3339 assert(ExtTy.is128BitVector() && "Unexpected extension size");
3340 if (OrigTy.getSizeInBits() >= 64)
3341 return N;
3342
3343 // Must extend size to at least 64 bits to be used as an operand for VMULL.
3344 EVT NewVT = getExtensionTo64Bits(OrigTy);
3345
3346 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
3347}
3348
// isExtendedBUILD_VECTOR (name from the call sites in isSignExtended /
// isZeroExtended; first signature line elided in this listing). Returns
// true if N is a BUILD_VECTOR whose elements are all constants that fit in
// half the element width — i.e. the vector behaves like an extended value
// for S/UMULL formation.
3350 bool isSigned) {
3351 EVT VT = N->getValueType(0);
3352
3353 if (N->getOpcode() != ISD::BUILD_VECTOR)
3354 return false;
3355
3356 for (const SDValue &Elt : N->op_values()) {
// (A dyn_cast to ConstantSDNode guarding this block is elided here.)
3358 unsigned EltSize = VT.getScalarSizeInBits();
3359 unsigned HalfSize = EltSize / 2;
3360 if (isSigned) {
3361 if (!isIntN(HalfSize, C->getSExtValue()))
3362 return false;
3363 } else {
3364 if (!isUIntN(HalfSize, C->getZExtValue()))
3365 return false;
3366 }
3367 continue;
3368 }
// Non-constant element: not an extended BUILD_VECTOR.
3369 return false;
3370 }
3371
3372 return true;
3373}
3374
// skipExtensionForVectorMULL (signature line elided in this listing; name
// from the call site in LowerMUL). Strips an explicit extension (possibly
// re-extending to 64 bits), or truncates a constant BUILD_VECTOR to the
// half-width type, producing the 64-bit operand S/UMULL expects.
3375 if (N->getOpcode() == ISD::SIGN_EXTEND ||
3377 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
3378 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
3379 N->getOperand(0)->getValueType(0),
3380 N->getValueType(0),
3381 N->getOpcode());
3382
3383 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3384 EVT VT = N->getValueType(0);
3385 SDLoc dl(N);
3386 unsigned EltSize = VT.getScalarSizeInBits() / 2;
3387 unsigned NumElts = VT.getVectorNumElements();
3388 MVT TruncVT = MVT::getIntegerVT(EltSize);
3390 for (unsigned i = 0; i != NumElts; ++i) {
3391 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3392 const APInt &CInt = C->getAPIntValue();
3393 // Element types smaller than 32 bits are not legal, so use i32 elements.
3394 // The values are implicitly truncated so sext vs. zext doesn't matter.
3395 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3396 }
3397 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3398}
3399
// isSignExtended (signature line elided in this listing; name from the call
// sites in isAddSubSExt / LowerMUL). True if N is a sign-compatible extend
// or a BUILD_VECTOR of constants that fit in half the element width.
3401 return N->getOpcode() == ISD::SIGN_EXTEND ||
3402 N->getOpcode() == ISD::ANY_EXTEND ||
3403 isExtendedBUILD_VECTOR(N, DAG, true);
3404}
3405
// isZeroExtended (signature line elided in this listing; name from the call
// sites in isAddSubZExt / LowerMUL). True if N is a zero-compatible extend
// or a BUILD_VECTOR of constants that fit in half the element width.
3407 return N->getOpcode() == ISD::ZERO_EXTEND ||
3408 N->getOpcode() == ISD::ANY_EXTEND ||
3409 isExtendedBUILD_VECTOR(N, DAG, false);
3410}
3411
3412static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3413 unsigned Opcode = N->getOpcode();
3414 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3415 SDNode *N0 = N->getOperand(0).getNode();
3416 SDNode *N1 = N->getOperand(1).getNode();
3417 return N0->hasOneUse() && N1->hasOneUse() &&
3418 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3419 }
3420 return false;
3421}
3422
3423static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3424 unsigned Opcode = N->getOpcode();
3425 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3426 SDNode *N0 = N->getOperand(0).getNode();
3427 SDNode *N1 = N->getOperand(1).getNode();
3428 return N0->hasOneUse() && N1->hasOneUse() &&
3429 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3430 }
3431 return false;
3432}
3433
// Lower ISD::FLT_ROUNDS_ by reading the FP control register and extracting
// the rounding-mode field.
3434SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3435 SelectionDAG &DAG) const {
// NOTE(review): the code reads the AArch64 FPCR (aarch64_get_fpcr below),
// not the AArch32 FPSCR the original comment named.
3436 // The rounding mode is in bits 23:22 of the FPCR.
3437 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2,
3438 // 2->3, 3->0. The formula we use to implement this is
3439 // (((FPCR + 1 << 22) >> 22) & 3) so that the shift + and get folded into
3440 // a bitfield extract.
3440 SDLoc dl(Op);
3441
3442 SDValue Chain = Op.getOperand(0);
3443 SDValue FPCR_64 = DAG.getNode(
3445 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
// Thread the chain from the register read so ordering is preserved.
3446 Chain = FPCR_64.getValue(1);
3449 DAG.getConstant(1U << 22, dl, MVT::i32));
3451 DAG.getConstant(22, dl, MVT::i32));
3453 DAG.getConstant(3, dl, MVT::i32));
3454 return DAG.getMergeValues({AND, Chain}, dl);
3455}
3456
// Custom lowering for ISD::MUL on 128-bit integer vectors. Recognizes
// sign/zero-extended operands so the multiply can be selected as
// SMULL/UMULL, optionally distributing over an extended add/sub (MLA form).
// Scalable (or SVE-eligible i64) vectors use the predicated MUL node.
// NOTE(review): the lines assigning NewOpc constants are elided in this
// rendered listing.
3457SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
3458 EVT VT = Op.getValueType();
3459
3460 // If SVE is available then i64 vector multiplications can also be made legal.
3461 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
3462
3463 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
3464 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
3465
3466 // Multiplications are only custom-lowered for 128-bit vectors so that
3467 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
3468 assert(VT.is128BitVector() && VT.isInteger() &&
3469 "unexpected type for custom-lowering ISD::MUL");
3470 SDNode *N0 = Op.getOperand(0).getNode();
3471 SDNode *N1 = Op.getOperand(1).getNode();
3472 unsigned NewOpc = 0;
3473 bool isMLA = false;
3474 bool isN0SExt = isSignExtended(N0, DAG);
3475 bool isN1SExt = isSignExtended(N1, DAG);
3476 if (isN0SExt && isN1SExt)
3478 else {
3479 bool isN0ZExt = isZeroExtended(N0, DAG);
3480 bool isN1ZExt = isZeroExtended(N1, DAG);
3481 if (isN0ZExt && isN1ZExt)
3483 else if (isN1SExt || isN1ZExt) {
3484 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3485 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3486 if (isN1SExt && isAddSubSExt(N0, DAG)) {
3488 isMLA = true;
3489 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3491 isMLA = true;
3492 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3493 std::swap(N0, N1);
3495 isMLA = true;
3496 }
3497 }
3498
3499 if (!NewOpc) {
3500 if (VT == MVT::v2i64)
3501 // Fall through to expand this. It is not legal.
3502 return SDValue();
3503 else
3504 // Other vector multiplications are legal.
3505 return Op;
3506 }
3507 }
3508
3509 // Legalize to a S/UMULL instruction
3510 SDLoc DL(Op);
3511 SDValue Op0;
3513 if (!isMLA) {
3514 Op0 = skipExtensionForVectorMULL(N0, DAG);
3516 Op1.getValueType().is64BitVector() &&
3517 "unexpected types for extended operands to VMULL");
3518 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3519 }
3520 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
3521 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
3522 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
3525 EVT Op1VT = Op1.getValueType();
3526 return DAG.getNode(N0->getOpcode(), DL, VT,
3527 DAG.getNode(NewOpc, DL, VT,
3528 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3529 DAG.getNode(NewOpc, DL, VT,
3530 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3531}
3532
// Build an SVE PTRUE node of type VT for the given predicate pattern
// (the constant-operand line is elided in this rendered listing).
3533static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3534 int Pattern) {
3535 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3537}
3538
// Custom lowering for chainless target intrinsics: maps NEON and SVE
// intrinsics onto generic ISD opcodes or AArch64-specific DAG nodes so the
// rest of the pipeline can optimize/select them. Intrinsics not listed fall
// through the default case unchanged.
// NOTE(review): several lines are elided in this rendered listing.
3539SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3540 SelectionDAG &DAG) const {
3541 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3542 SDLoc dl(Op);
3543 switch (IntNo) {
3544 default: return SDValue(); // Don't custom lower most intrinsics.
3545 case Intrinsic::thread_pointer: {
3547 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3548 }
// Scalar i64 abs is done by bouncing through v1i64 so the NEON ABS
// instruction can be used; legal integer vectors map straight to ISD::ABS.
3549 case Intrinsic::aarch64_neon_abs: {
3550 EVT Ty = Op.getValueType();
3551 if (Ty == MVT::i64) {
3553 Op.getOperand(1));
3554 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3555 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3556 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3557 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3558 } else {
3559 report_fatal_error("Unexpected type for AArch64 NEON intrinic");
3560 }
3561 }
// NEON min/max intrinsics become the generic ISD nodes.
3562 case Intrinsic::aarch64_neon_smax:
3563 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3564 Op.getOperand(1), Op.getOperand(2));
3565 case Intrinsic::aarch64_neon_umax:
3566 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3567 Op.getOperand(1), Op.getOperand(2));
3568 case Intrinsic::aarch64_neon_smin:
3569 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3570 Op.getOperand(1), Op.getOperand(2));
3571 case Intrinsic::aarch64_neon_umin:
3572 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3573 Op.getOperand(1), Op.getOperand(2));
3574
// SVE permute/unpack intrinsics map 1:1 onto AArch64ISD nodes.
3575 case Intrinsic::aarch64_sve_sunpkhi:
3576 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3577 Op.getOperand(1));
3578 case Intrinsic::aarch64_sve_sunpklo:
3579 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3580 Op.getOperand(1));
3581 case Intrinsic::aarch64_sve_uunpkhi:
3582 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3583 Op.getOperand(1));
3584 case Intrinsic::aarch64_sve_uunpklo:
3585 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3586 Op.getOperand(1));
3587 case Intrinsic::aarch64_sve_clasta_n:
3588 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3589 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3590 case Intrinsic::aarch64_sve_clastb_n:
3591 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3592 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3593 case Intrinsic::aarch64_sve_lasta:
3594 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3595 Op.getOperand(1), Op.getOperand(2));
3596 case Intrinsic::aarch64_sve_lastb:
3597 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3598 Op.getOperand(1), Op.getOperand(2));
3599 case Intrinsic::aarch64_sve_rev:
3600 return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
3601 Op.getOperand(1));
3602 case Intrinsic::aarch64_sve_tbl:
3603 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3604 Op.getOperand(1), Op.getOperand(2));
3605 case Intrinsic::aarch64_sve_trn1:
3606 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3607 Op.getOperand(1), Op.getOperand(2));
3608 case Intrinsic::aarch64_sve_trn2:
3609 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3610 Op.getOperand(1), Op.getOperand(2));
3611 case Intrinsic::aarch64_sve_uzp1:
3612 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3613 Op.getOperand(1), Op.getOperand(2));
3614 case Intrinsic::aarch64_sve_uzp2:
3615 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3616 Op.getOperand(1), Op.getOperand(2));
3617 case Intrinsic::aarch64_sve_zip1:
3618 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3619 Op.getOperand(1), Op.getOperand(2));
3620 case Intrinsic::aarch64_sve_zip2:
3621 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3622 Op.getOperand(1), Op.getOperand(2));
3623 case Intrinsic::aarch64_sve_ptrue:
3624 return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3625 Op.getOperand(1));
// Predicated SVE unary ops: note the merged-operand order is
// (predicate=op2, data=op3, passthru=op1) re-arranged to the node's
// (pg, src, passthru) operand order.
3626 case Intrinsic::aarch64_sve_clz:
3627 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
3628 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3629 case Intrinsic::aarch64_sve_cnt: {
3630 SDValue Data = Op.getOperand(3);
3631 // CTPOP only supports integer operands.
3632 if (Data.getValueType().isFloatingPoint())
3633 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
3634 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
3635 Op.getOperand(2), Data, Op.getOperand(1));
3636 }
3637 case Intrinsic::aarch64_sve_dupq_lane:
3638 return LowerDUPQLane(Op, DAG);
3639 case Intrinsic::aarch64_sve_convert_from_svbool:
3640 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3641 Op.getOperand(1));
3642 case Intrinsic::aarch64_sve_fneg:
3643 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3644 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3645 case Intrinsic::aarch64_sve_frintp:
3646 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
3647 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3648 case Intrinsic::aarch64_sve_frintm:
3649 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
3650 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3651 case Intrinsic::aarch64_sve_frinti:
3652 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3653 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3654 case Intrinsic::aarch64_sve_frintx:
3655 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3656 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3657 case Intrinsic::aarch64_sve_frinta:
3658 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
3659 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3660 case Intrinsic::aarch64_sve_frintn:
3661 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
3662 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3663 case Intrinsic::aarch64_sve_frintz:
3664 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
3665 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3666 case Intrinsic::aarch64_sve_ucvtf:
3668 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3669 Op.getOperand(1));
3670 case Intrinsic::aarch64_sve_scvtf:
3672 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3673 Op.getOperand(1));
3674 case Intrinsic::aarch64_sve_fcvtzu:
3676 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3677 Op.getOperand(1));
3678 case Intrinsic::aarch64_sve_fcvtzs:
3680 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3681 Op.getOperand(1));
3682 case Intrinsic::aarch64_sve_fsqrt:
3683 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
3684 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3685 case Intrinsic::aarch64_sve_frecpx:
3686 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
3687 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3688 case Intrinsic::aarch64_sve_fabs:
3689 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
3690 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3691 case Intrinsic::aarch64_sve_abs:
3692 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
3693 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3694 case Intrinsic::aarch64_sve_neg:
3695 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3696 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3697 case Intrinsic::aarch64_sve_convert_to_svbool: {
3698 EVT OutVT = Op.getValueType();
3699 EVT InVT = Op.getOperand(1).getValueType();
3700 // Return the operand if the cast isn't changing type,
3701 // i.e. <n x 16 x i1> -> <n x 16 x i1>
3702 if (InVT == OutVT)
3703 return Op.getOperand(1);
3704 // Otherwise, zero the newly introduced lanes.
3706 DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
3707 SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
3711 }
3712
// INSR takes a GPR scalar; sub-i32 scalars are widened first.
3713 case Intrinsic::aarch64_sve_insr: {
3714 SDValue Scalar = Op.getOperand(2);
3715 EVT ScalarTy = Scalar.getValueType();
3716 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
3717 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
3718
3719 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
3720 Op.getOperand(1), Scalar);
3721 }
3722 case Intrinsic::aarch64_sve_rbit:
3724 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3725 Op.getOperand(1));
3726 case Intrinsic::aarch64_sve_revb:
3727 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
3728 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
// SVE sign/zero extensions carry the narrow element type as an extra
// ValueType operand so selection knows how many bits to extend from.
3729 case Intrinsic::aarch64_sve_sxtb:
3730 return DAG.getNode(
3732 Op.getOperand(2), Op.getOperand(3),
3733 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
3734 Op.getOperand(1));
3735 case Intrinsic::aarch64_sve_sxth:
3736 return DAG.getNode(
3738 Op.getOperand(2), Op.getOperand(3),
3739 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
3740 Op.getOperand(1));
3741 case Intrinsic::aarch64_sve_sxtw:
3742 return DAG.getNode(
3744 Op.getOperand(2), Op.getOperand(3),
3745 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
3746 Op.getOperand(1));
3747 case Intrinsic::aarch64_sve_uxtb:
3748 return DAG.getNode(
3750 Op.getOperand(2), Op.getOperand(3),
3751 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
3752 Op.getOperand(1));
3753 case Intrinsic::aarch64_sve_uxth:
3754 return DAG.getNode(
3756 Op.getOperand(2), Op.getOperand(3),
3757 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
3758 Op.getOperand(1));
3759 case Intrinsic::aarch64_sve_uxtw:
3760 return DAG.getNode(
3762 Op.getOperand(2), Op.getOperand(3),
3763 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
3764 Op.getOperand(1));
3765
3766 case Intrinsic::localaddress: {
3767 const auto &MF = DAG.getMachineFunction();
3768 const auto *RegInfo = Subtarget->getRegisterInfo();
3769 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
3770 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
3771 Op.getSimpleValueType());
3772 }
3773
3774 case Intrinsic::eh_recoverfp: {
3775 // FIXME: This needs to be implemented to correctly handle highly aligned
3776 // stack objects. For now we simply return the incoming FP. Refer D53541
3777 // for more details.
3778 SDValue FnOp = Op.getOperand(1);
3779 SDValue IncomingFPOp = Op.getOperand(2);
3781 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
3782 if (!Fn)
3784 "llvm.eh.recoverfp must take a function as the first argument");
3785 return IncomingFPOp;
3786 }
3787
3788 case Intrinsic::aarch64_neon_vsri:
3789 case Intrinsic::aarch64_neon_vsli: {
3790 EVT Ty = Op.getValueType();
3791
3792 if (!Ty.isVector())
3793 report_fatal_error("Unexpected type for aarch64_neon_vsli");
3794
3795 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
3796
3797 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
3798 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
3799 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
3800 Op.getOperand(3));
3801 }
3802
// Halving-add variants: pick signed/unsigned and rounding/truncating node.
3803 case Intrinsic::aarch64_neon_srhadd:
3804 case Intrinsic::aarch64_neon_urhadd:
3805 case Intrinsic::aarch64_neon_shadd:
3806 case Intrinsic::aarch64_neon_uhadd: {
3807 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
3808 IntNo == Intrinsic::aarch64_neon_shadd);
3809 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
3810 IntNo == Intrinsic::aarch64_neon_urhadd);
3811 unsigned Opcode =
3813 : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
3814 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
3815 Op.getOperand(2));
3816 }
3817
3818 case Intrinsic::aarch64_neon_uabd: {
3819 return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(),
3820 Op.getOperand(1), Op.getOperand(2));
3821 }
3822 case Intrinsic::aarch64_neon_sabd: {
3823 return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(),
3824 Op.getOperand(1), Op.getOperand(2));
3825 }
3826 }
3827}
3828
// TargetLowering hook: whether an extend of a gather/scatter index vector
// can be dropped (the extend is folded into the addressing mode). True only
// for i32-element indices (the second half of the condition is elided in
// this rendered listing).
3829bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
3830 if (VT.getVectorElementType() == MVT::i32 &&
3832 return true;
3833
3834 return false;
3835}
3836
3837bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
3838 return ExtVal.getValueType().isScalableVector();
3839}
3840
// Select the AArch64ISD gather opcode for a (scaled, signed, needs-extend)
// index-addressing combination. The opcode constants for each tuple are
// elided in this rendered listing.
3841unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
3842 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
3843 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
3845 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
3847 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
3849 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
3851 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
3853 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
3855 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
3857 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
3859 };
// All eight combinations are present, so the lookup cannot fail.
3860 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
3861 return AddrModes.find(Key)->second;
3862}
3863
// Select the AArch64ISD scatter opcode for a (scaled, signed, needs-extend)
// index-addressing combination; mirrors getGatherVecOpcode. The opcode
// constants for each tuple are elided in this rendered listing.
3864unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
3865 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
3866 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
3868 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
3870 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
3872 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
3874 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
3876 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
3878 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
3880 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
3882 };
// All eight combinations are present, so the lookup cannot fail.
3883 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
3884 return AddrModes.find(Key)->second;
3885}
3886
// Predicate on a gather/scatter index vector (signature line elided in this
// listing — presumably a bool function taking the Index SDValue; TODO
// confirm). Returns true when the index is already an i32-in-i64 value:
// either an explicit SIGN_EXTEND_INREG, or an AND with a splatted
// 0xFFFFFFFF mask (i.e. a zero-extension from 32 bits).
3908
3910 unsigned Opcode = Index.getOpcode();
3911 if (Opcode == ISD::SIGN_EXTEND_INREG)
3912 return true;
3913
3914 if (Opcode == ISD::AND) {
3915 SDValue Splat = Index.getOperand(1);
3916 if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
3917 return false;
3919 if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
3920 return false;
3921 return true;
3922 }
3923
3924 return false;
3925}
3926
3927// If the base pointer of a masked gather or scatter is null, we
3928// may be able to swap BasePtr & Index and use the vector + register
3929// or vector + immediate addressing mode, e.g.
3930// VECTOR + REGISTER:
3931// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
3932// -> getelementptr %offset, <vscale x N x T> %indices
3933// VECTOR + IMMEDIATE:
3934// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
3935// -> getelementptr #x, <vscale x N x T> %indices
// Mutates BasePtr/Index/Opcode in place; does nothing unless BasePtr is the
// null constant. (Some lines are elided in this rendered listing.)
3937 unsigned &Opcode, bool IsGather,
3938 SelectionDAG &DAG) {
3939 if (!isNullConstant(BasePtr))
3940 return;
3941
// If the index is (add %indices, splat(X)), peel the splat off: a constant
// splat becomes a candidate immediate offset, a non-constant one becomes
// the new scalar base pointer.
3942 ConstantSDNode *Offset = nullptr;
3943 if (Index.getOpcode() == ISD::ADD)
3944 if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
3947 else {
3948 BasePtr = SplatVal;
3949 Index = Index->getOperand(0);
3950 return;
3951 }
3952 }
3953
3954 unsigned NewOp =
3956
// No splatted constant: just swap base and index (vector + register mode).
3957 if (!Offset) {
3958 std::swap(BasePtr, Index);
3959 Opcode = NewOp;
3960 return;
3961 }
3962
3963 uint64_t OffsetVal = Offset->getZExtValue();
3964 unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
3965 auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
3966
3968 // Index is out of range for the immediate addressing mode
3969 BasePtr = ConstOffset;
3970 Index = Index->getOperand(0);
3971 return;
3972 }
3973
3974 // Immediate is in range
3975 Opcode = NewOp;
3976 BasePtr = Index->getOperand(0);
3977 Index = ConstOffset;
3978}
3979
// Custom lowering for masked gather loads: decodes the index type into
// scaled/signed/extend flags, bitcasts FP data to integer so an integer
// gather node can be used, picks the gather opcode, and lets
// selectGatherScatterAddrMode rewrite null-base addressing.
// NOTE(review): several lines are elided in this rendered listing.
3980SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
3981 SelectionDAG &DAG) const {
3982 SDLoc DL(Op);
3984 assert(MGT && "Can only custom lower gather load nodes");
3985
3986 SDValue Index = MGT->getIndex();
3987 SDValue Chain = MGT->getChain();
3988 SDValue PassThru = MGT->getPassThru();
3989 SDValue Mask = MGT->getMask();
3990 SDValue BasePtr = MGT->getBasePtr();
3991 ISD::LoadExtType ExtTy = MGT->getExtensionType();
3992
// Decompose the memory index type into the three addressing-mode flags.
3993 ISD::MemIndexType IndexType = MGT->getIndexType();
3994 bool IsScaled =
3995 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
3996 bool IsSigned =
3997 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
3998 bool IdxNeedsExtend =
4000 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4001 bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
4002
4003 EVT VT = PassThru.getSimpleValueType();
4004 EVT MemVT = MGT->getMemoryVT();
4006
4007 if (VT.getVectorElementType() == MVT::bf16 &&
4008 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4009 return SDValue();
4010
4011 // Handle FP data by using an integer gather and casting the result.
4012 if (VT.isFloatingPoint()) {
4014 PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
4015 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4016 }
4017
4018 SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
4019
// Drop an index extension the addressing mode can absorb.
4021 Index = Index.getOperand(0);
4022
4023 unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
4024 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4025 /*isGather=*/true, DAG);
4026
4028 Opcode = getSignExtendedGatherOpcode(Opcode);
4029
4030 SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
4031 SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops);
4032
// Cast the integer gather result back to the original FP type.
4033 if (VT.isFloatingPoint()) {
4034 SDValue Cast = getSVESafeBitCast(VT, Gather, DAG);
4035 return DAG.getMergeValues({Cast, Gather}, DL);
4036 }
4037
4038 return Gather;
4039}
4040
// Custom lowering for masked scatter stores; mirrors LowerMGATHER: decode
// the index type, bitcast FP data to integer, pick the scatter opcode, and
// rewrite null-base addressing via selectGatherScatterAddrMode.
// NOTE(review): several lines are elided in this rendered listing.
4041SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
4042 SelectionDAG &DAG) const {
4043 SDLoc DL(Op);
4045 assert(MSC && "Can only custom lower scatter store nodes");
4046
4047 SDValue Index = MSC->getIndex();
4048 SDValue Chain = MSC->getChain();
4049 SDValue StoreVal = MSC->getValue();
4050 SDValue Mask = MSC->getMask();
4051 SDValue BasePtr = MSC->getBasePtr();
4052
// Decompose the memory index type into the three addressing-mode flags.
4053 ISD::MemIndexType IndexType = MSC->getIndexType();
4054 bool IsScaled =
4055 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4056 bool IsSigned =
4057 IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4058 bool NeedsExtend =
4060 Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4061
4062 EVT VT = StoreVal.getSimpleValueType();
4063 SDVTList VTs = DAG.getVTList(MVT::Other);
4064 EVT MemVT = MSC->getMemoryVT();
4066
4067 if (VT.getVectorElementType() == MVT::bf16 &&
4068 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4069 return SDValue();
4070
4071 // Handle FP data by casting the data so an integer scatter can be used.
4072 if (VT.isFloatingPoint()) {
4074 StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4075 InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4076 }
4077
// Drop an index extension the addressing mode can absorb.
4079 Index = Index.getOperand(0);
4080
4081 unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4082 selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4083 /*isGather=*/false, DAG);
4084
4085 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4086 return DAG.getNode(Opcode, DL, VTs, Ops);
4087}
4088
4089// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
// Takes a truncating StoreSDNode whose value type is v4i16 and memory type
// v4i8 and emits the two-instruction sequence shown below instead of the
// default scalarized expansion. Returns the replacement store chain.
// NOTE(review): the signature line (static helper taking SDLoc DL and
// StoreSDNode *ST) and several SDValue declarations (UndefVec, Trunc,
// ExtractTrunc) are elided in this rendering.
4091 EVT VT, EVT MemVT,
4092 SelectionDAG &DAG) {
4093 assert(VT.isVector() && "VT should be a vector type");
4094 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4095
4096 SDValue Value = ST->getValue();
4097
4098 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
4099 // the word lane which represent the v4i8 subvector. It optimizes the store
4100 // to:
4101 //
4102 // xtn v0.8b, v0.8h
4103 // str s0, [x0]
4104
// Pad the v4i16 value with undef lanes up to v8i16 so the truncate below
// produces a full v8i8 register.
4105 SDValue Undef = DAG.getUNDEF(MVT::i16);
4107 {Undef, Undef, Undef, Undef});
4108
4110 Value, UndefVec);
4112
// Reinterpret the v8i8 as v2i32 and keep only element 0 — the 32-bit lane
// holding the four meaningful bytes — then store it.
4113 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4115 Trunc, DAG.getConstant(0, DL, MVT::i64));
4116
4117 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4118 ST->getBasePtr(), ST->getMemOperand());
4119}
4120
4121// Custom lowering for any store, vector or scalar and/or default or with
4122// a truncate operations. Currently only custom lower truncate operation
4123// from vector v4i16 to v4i8 or volatile stores of i128.
// Returns the lowered store, or SDValue() to fall back to default handling.
// NOTE(review): doxygen rendering — the StoreSDNode cast producing
// StoreNode and the STNP/extract node-construction lines are elided here.
4124SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4125 SelectionDAG &DAG) const {
4126 SDLoc Dl(Op);
4128 assert (StoreNode && "Can only custom lower store nodes");
4129
4130 SDValue Value = StoreNode->getValue();
4131
4132 EVT VT = Value.getValueType();
4133 EVT MemVT = StoreNode->getMemoryVT();
4134
4135 if (VT.isVector()) {
// Fixed-length vectors that should use SVE take a dedicated path.
4136 if (useSVEForFixedLengthVectorVT(VT))
4137 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4138
// Under-aligned vector stores that the target cannot perform as a single
// misaligned access are scalarized.
4139 unsigned AS = StoreNode->getAddressSpace();
4140 Align Alignment = StoreNode->getAlign();
4141 if (Alignment < MemVT.getStoreSize() &&
4142 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
4143 StoreNode->getMemOperand()->getFlags(),
4144 nullptr)) {
4145 return scalarizeVectorStore(StoreNode, DAG);
4146 }
4147
4148 if (StoreNode->isTruncatingStore()) {
4149 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4150 }
4151 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4152 // the custom lowering, as there are no un-paired non-temporal stores and
4153 // legalization will break up 256 bit inputs.
4154 ElementCount EC = MemVT.getVectorElementCount();
4155 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
4156 EC.isKnownEven() &&
4157 ((MemVT.getScalarSizeInBits() == 8u ||
4158 MemVT.getScalarSizeInBits() == 16u ||
4159 MemVT.getScalarSizeInBits() == 32u ||
4160 MemVT.getScalarSizeInBits() == 64u))) {
// Split the 256-bit value into low/high halves (extract node lines
// elided in this rendering) and emit a single paired non-temporal store.
4161 SDValue Lo =
4163 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4164 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
4165 SDValue Hi =
4167 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4168 StoreNode->getValue(),
4169 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
4172 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4173 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4174 return Result;
4175 }
4176 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
// Volatile i128 stores are split into two 64-bit halves and emitted as a
// single paired store node (extract lines elided in this rendering).
4177 assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
4178 SDValue Lo =
4180 DAG.getConstant(0, Dl, MVT::i64));
4181 SDValue Hi =
4183 DAG.getConstant(1, Dl, MVT::i64));
4186 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4187 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4188 return Result;
4189 }
4190
// Anything else keeps the default lowering.
4191 return SDValue();
4192}
4193
4194// Generate SUBS and CSEL for integer abs.
// Vector abs is routed to the predicated SVE/NEON lowering; scalar abs is
// built as neg = 0 - x, then a compare (SUBS) and a conditional select
// (CSEL) between x and neg on the flags result.
// NOTE(review): the SUBS node construction line and the condition-code
// operand of the CSEL are elided in this doxygen rendering.
4195SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
4196 MVT VT = Op.getSimpleValueType();
4197
4198 if (VT.isVector())
4199 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
4200
4201 SDLoc DL(Op);
// Neg = 0 - x.
4202 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
4203 Op.getOperand(0));
4204 // Generate SUBS & CSEL.
4205 SDValue Cmp =
4207 Op.getOperand(0), DAG.getConstant(0, DL, VT));
// Select x or -x based on the flags (value 1 of the compare node).
4208 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
4210 Cmp.getValue(1));
4211}
4212
// LowerOperation: central dispatch for every ISD opcode this target marked
// Custom. Each case forwards to a dedicated Lower* routine or to the
// generic predicated/scalable-op helpers; the default case is unreachable
// because only opcodes registered for custom lowering arrive here.
// NOTE(review): doxygen rendering — the function's signature line and a
// number of case labels (e.g. GlobalTLSAddress, STRICT_FSETCCS, vector
// insert/extract, the strict FP<->int cases, INTRINSIC_WO_CHAIN, the
// remaining VECREDUCE_* and atomic cases, SIGN_EXTEND_INREG) are elided;
// only their return statements remain visible.
4214 SelectionDAG &DAG) const {
4215 LLVM_DEBUG(dbgs() << "Custom lowering: ");
4216 LLVM_DEBUG(Op.dump());
4217
4218 switch (Op.getOpcode()) {
4219 default:
4220 llvm_unreachable("unimplemented operand");
4221 return SDValue();
4222 case ISD::BITCAST:
4223 return LowerBITCAST(Op, DAG);
4224 case ISD::GlobalAddress:
4225 return LowerGlobalAddress(Op, DAG);
4227 return LowerGlobalTLSAddress(Op, DAG);
4228 case ISD::SETCC:
4229 case ISD::STRICT_FSETCC:
4231 return LowerSETCC(Op, DAG);
4232 case ISD::BR_CC:
4233 return LowerBR_CC(Op, DAG);
4234 case ISD::SELECT:
4235 return LowerSELECT(Op, DAG);
4236 case ISD::SELECT_CC:
4237 return LowerSELECT_CC(Op, DAG);
4238 case ISD::JumpTable:
4239 return LowerJumpTable(Op, DAG);
4240 case ISD::BR_JT:
4241 return LowerBR_JT(Op, DAG);
4242 case ISD::ConstantPool:
4243 return LowerConstantPool(Op, DAG);
4244 case ISD::BlockAddress:
4245 return LowerBlockAddress(Op, DAG);
4246 case ISD::VASTART:
4247 return LowerVASTART(Op, DAG);
4248 case ISD::VACOPY:
4249 return LowerVACOPY(Op, DAG);
4250 case ISD::VAARG:
4251 return LowerVAARG(Op, DAG);
4252 case ISD::ADDC:
4253 case ISD::ADDE:
4254 case ISD::SUBC:
4255 case ISD::SUBE:
4256 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
4257 case ISD::SADDO:
4258 case ISD::UADDO:
4259 case ISD::SSUBO:
4260 case ISD::USUBO:
4261 case ISD::SMULO:
4262 case ISD::UMULO:
4263 return LowerXALUO(Op, DAG);
// Floating-point arithmetic is lowered to SVE predicated node forms.
4264 case ISD::FADD:
4265 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
4266 case ISD::FSUB:
4267 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
4268 case ISD::FMUL:
4269 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
4270 case ISD::FMA:
4271 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
4272 case ISD::FDIV:
4273 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
4274 case ISD::FNEG:
4275 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
4276 case ISD::FCEIL:
4277 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
4278 case ISD::FFLOOR:
4279 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
4280 case ISD::FNEARBYINT:
4281 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
4282 case ISD::FRINT:
4283 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
4284 case ISD::FROUND:
4285 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
4286 case ISD::FROUNDEVEN:
4287 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
4288 case ISD::FTRUNC:
4289 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
4290 case ISD::FSQRT:
4291 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
4292 case ISD::FABS:
4293 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
4294 case ISD::FP_ROUND:
4296 return LowerFP_ROUND(Op, DAG);
4297 case ISD::FP_EXTEND:
4298 return LowerFP_EXTEND(Op, DAG);
4299 case ISD::FRAMEADDR:
4300 return LowerFRAMEADDR(Op, DAG);
4301 case ISD::SPONENTRY:
4302 return LowerSPONENTRY(Op, DAG);
4303 case ISD::RETURNADDR:
4304 return LowerRETURNADDR(Op, DAG);
4306 return LowerADDROFRETURNADDR(Op, DAG);
4308 return LowerCONCAT_VECTORS(Op, DAG);
4310 return LowerINSERT_VECTOR_ELT(Op, DAG);
4312 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
4313 case ISD::BUILD_VECTOR:
4314 return LowerBUILD_VECTOR(Op, DAG);
4316 return LowerVECTOR_SHUFFLE(Op, DAG);
4317 case ISD::SPLAT_VECTOR:
4318 return LowerSPLAT_VECTOR(Op, DAG);
4320 return LowerEXTRACT_SUBVECTOR(Op, DAG);
4322 return LowerINSERT_SUBVECTOR(Op, DAG);
4323 case ISD::SDIV:
4324 case ISD::UDIV:
4325 return LowerDIV(Op, DAG);
// Integer min/max use predicated ops even for NEON-sized vectors.
4326 case ISD::SMIN:
4327 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
4328 /*OverrideNEON=*/true);
4329 case ISD::UMIN:
4330 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
4331 /*OverrideNEON=*/true);
4332 case ISD::SMAX:
4333 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
4334 /*OverrideNEON=*/true);
4335 case ISD::UMAX:
4336 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
4337 /*OverrideNEON=*/true);
4338 case ISD::SRA:
4339 case ISD::SRL:
4340 case ISD::SHL:
4341 return LowerVectorSRA_SRL_SHL(Op, DAG);
4342 case ISD::SHL_PARTS:
4343 return LowerShiftLeftParts(Op, DAG);
4344 case ISD::SRL_PARTS:
4345 case ISD::SRA_PARTS:
4346 return LowerShiftRightParts(Op, DAG);
4347 case ISD::CTPOP:
4348 return LowerCTPOP(Op, DAG);
4349 case ISD::FCOPYSIGN:
4350 return LowerFCOPYSIGN(Op, DAG);
4351 case ISD::OR:
4352 return LowerVectorOR(Op, DAG);
4353 case ISD::XOR:
4354 return LowerXOR(Op, DAG);
4355 case ISD::PREFETCH:
4356 return LowerPREFETCH(Op, DAG);
4357 case ISD::SINT_TO_FP:
4358 case ISD::UINT_TO_FP:
4361 return LowerINT_TO_FP(Op, DAG);
4362 case ISD::FP_TO_SINT:
4363 case ISD::FP_TO_UINT:
4366 return LowerFP_TO_INT(Op, DAG);
4367 case ISD::FSINCOS:
4368 return LowerFSINCOS(Op, DAG);
4369 case ISD::FLT_ROUNDS_:
4370 return LowerFLT_ROUNDS_(Op, DAG);
4371 case ISD::MUL:
4372 return LowerMUL(Op, DAG);
4374 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4375 case ISD::STORE:
4376 return LowerSTORE(Op, DAG);
4377 case ISD::MGATHER:
4378 return LowerMGATHER(Op, DAG);
4379 case ISD::MSCATTER:
4380 return LowerMSCATTER(Op, DAG);
4382 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
4383 case ISD::VECREDUCE_ADD:
4384 case ISD::VECREDUCE_AND:
4385 case ISD::VECREDUCE_OR:
4386 case ISD::VECREDUCE_XOR:
4394 return LowerVECREDUCE(Op, DAG);
4396 return LowerATOMIC_LOAD_SUB(Op, DAG);
4398 return LowerATOMIC_LOAD_AND(Op, DAG);
4400 return LowerDYNAMIC_STACKALLOC(Op, DAG);
4401 case ISD::VSCALE:
4402 return LowerVSCALE(Op, DAG);
4403 case ISD::ANY_EXTEND:
4404 case ISD::SIGN_EXTEND:
4405 case ISD::ZERO_EXTEND:
4406 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
4408 // Only custom lower when ExtraVT has a legal byte based element type.
4409 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4410 EVT ExtraEltVT = ExtraVT.getVectorElementType();
4411 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
4413 return SDValue();
4414
4415 return LowerToPredicatedOp(Op, DAG,
4417 }
4418 case ISD::TRUNCATE:
4419 return LowerTRUNCATE(Op, DAG);
4420 case ISD::LOAD:
// Only fixed-length-SVE loads are registered as Custom, so anything else
// reaching here is a bug.
4421 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
4422 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
4423 llvm_unreachable("Unexpected request to lower ISD::LOAD");
4424 case ISD::ADD:
4425 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
4426 case ISD::AND:
4427 return LowerToScalableOp(Op, DAG);
4428 case ISD::SUB:
4429 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
4430 case ISD::FMAXNUM:
4431 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
4432 case ISD::FMINNUM:
4433 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
4434 case ISD::VSELECT:
4435 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
4436 case ISD::ABS:
4437 return LowerABS(Op, DAG);
4438 case ISD::BITREVERSE:
4439 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
4440 /*OverrideNEON=*/true);
4441 case ISD::BSWAP:
4442 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
4443 case ISD::CTLZ:
4444 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
4445 /*OverrideNEON=*/true);
4446 case ISD::CTTZ:
4447 return LowerCTTZ(Op, DAG);
4448 }
4449}
4450
4454
4455bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
4456 EVT VT, bool OverrideNEON) const {
4457 if (!Subtarget->useSVEForFixedLengthVectors())
4458 return false;
4459
4460 if (!VT.isFixedLengthVector())
4461 return false;
4462
4463 // Don't use SVE for vectors we cannot scalarize if required.
4464 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
4465 // Fixed length predicates should be promoted to i8.
4466 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
4467 case MVT::i1:
4468 default:
4469 return false;
4470 case MVT::i8:
4471 case MVT::i16:
4472 case MVT::i32:
4473 case MVT::i64:
4474 case MVT::f16:
4475 case MVT::f32:
4476 case MVT::f64:
4477 break;
4478 }
4479
4480 // All SVE implementations support NEON sized vectors.
4481 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
4482 return true;
4483
4484 // Ensure NEON MVTs only belong to a single register class.
4485 if (VT.getFixedSizeInBits() <= 128)
4486 return false;
4487
4488 // Don't use SVE for types that don't fit.
4489 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
4490 return false;
4491
4492 // TODO: Perhaps an artificial restriction, but worth having whilst getting
4493 // the base fixed length SVE support in place.
4494 if (!VT.isPow2VectorType())
4495 return false;
4496
4497 return true;
4498}
4499
4500//===----------------------------------------------------------------------===//
4501// Calling Convention Implementation
4502//===----------------------------------------------------------------------===//
4503
4504/// Selects the correct CCAssignFn for a given CallingConvention value.
// Unsupported conventions are a fatal error. For the C-family conventions
// the result depends on the target OS (Windows vararg, generic AAPCS, or
// the Darwin variants); Win64 picks between the vararg and AAPCS functions.
// NOTE(review): doxygen rendering — the signature line and several case
// labels (e.g. WebKit_JS, the remaining C-family/AAPCS conventions, the
// Windows-vararg and Darwin-vararg return lines) are elided here.
4506 bool IsVarArg) const {
4507 switch (CC) {
4508 default:
4509 report_fatal_error("Unsupported calling convention.");
4511 return CC_AArch64_WebKit_JS;
4512 case CallingConv::GHC:
4513 return CC_AArch64_GHC;
4514 case CallingConv::C:
4515 case CallingConv::Fast:
4518 case CallingConv::Swift:
// Windows variadic calls use a dedicated assignment function (return line
// elided in this rendering).
4519 if (Subtarget->isTargetWindows() && IsVarArg)
4521 if (!Subtarget->isTargetDarwin())
4522 return CC_AArch64_AAPCS;
4523 if (!IsVarArg)
4524 return CC_AArch64_DarwinPCS;
4527 case CallingConv::Win64:
4528 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
4533 return CC_AArch64_AAPCS;
4534 }
4535}
4536
// NOTE(review): the body of this function (original lines 4538-4541) is
// elided in this rendering; presumably it selects the CCAssignFn used for
// return values, mirroring CCAssignFnForCall above — confirm against the
// original source.
4537CCAssignFn *
4542
// Lower the incoming formal arguments of the current function: assign each
// argument a register or stack location via the calling-convention
// analysis, materialise each one as an SDValue pushed onto InVals, save
// variadic registers when needed, and record stack-size bookkeeping for
// tail calls. Returns the (possibly updated) entry chain.
// NOTE(review): doxygen rendering — lines holding linked identifiers (the
// MachineFunction/FuncInfo accessors, SmallVector declarations, several
// CCValAssign/loop-local declarations, the Indirect/AssertZext case labels,
// and some helper-call lines) are elided throughout.
4543SDValue AArch64TargetLowering::LowerFormalArguments(
4544 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4545 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
4546 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4548 MachineFrameInfo &MFI = MF.getFrameInfo();
4549 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
4550
4551 // Assign locations to all of the incoming arguments.
4554 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4555 *DAG.getContext());
4556
4557 // At this point, Ins[].VT may already be promoted to i32. To correctly
4558 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
4559 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
4560 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
4561 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
4562 // LocVT.
4563 unsigned NumArgs = Ins.size();
4565 unsigned CurArgIdx = 0;
4566 for (unsigned i = 0; i != NumArgs; ++i) {
4567 MVT ValVT = Ins[i].VT;
4568 if (Ins[i].isOrigArg()) {
4569 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
4570 CurArgIdx = Ins[i].getOrigArgIndex();
4571
4572 // Get type of the original argument.
4573 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
4574 /*AllowUnknown*/ true);
4575 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
4576 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
4577 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
4578 ValVT = MVT::i8;
4579 else if (ActualMVT == MVT::i16)
4580 ValVT = MVT::i16;
4581 }
4582 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
4583 bool Res =
4584 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
4585 assert(!Res && "Call operand has unhandled type");
4586 (void)Res;
4587 }
// Second pass: materialise each assigned location into an SDValue.
// ExtraArgLocs tracks Ins entries consumed by multi-part indirect
// (scalable tuple) arguments so the final size assertion balances.
4589 unsigned ExtraArgLocs = 0;
4590 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4592
4593 if (Ins[i].Flags.isByVal()) {
4594 // Byval is used for HFAs in the PCS, but the system should work in a
4595 // non-compliant manner for larger structs.
4597 int Size = Ins[i].Flags.getByValSize();
4598 unsigned NumRegs = (Size + 7) / 8;
4599
4600 // FIXME: This works on big-endian for composite byvals, which are the common
4601 // case. It should also work for fundamental types too.
4602 unsigned FrameIdx =
4603 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
4604 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
4605 InVals.push_back(FrameIdxN);
4606
4607 continue;
4608 }
4609
4610 SDValue ArgValue;
4611 if (VA.isRegLoc()) {
4612 // Arguments stored in registers.
4613 EVT RegVT = VA.getLocVT();
4614 const TargetRegisterClass *RC;
4615
// Pick the register class matching the location type, including SVE
// predicate (PPR) and data (ZPR) registers for scalable vectors.
4616 if (RegVT == MVT::i32)
4617 RC = &AArch64::GPR32RegClass;
4618 else if (RegVT == MVT::i64)
4619 RC = &AArch64::GPR64RegClass;
4620 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4621 RC = &AArch64::FPR16RegClass;
4622 else if (RegVT == MVT::f32)
4623 RC = &AArch64::FPR32RegClass;
4624 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
4625 RC = &AArch64::FPR64RegClass;
4626 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
4627 RC = &AArch64::FPR128RegClass;
4628 else if (RegVT.isScalableVector() &&
4629 RegVT.getVectorElementType() == MVT::i1)
4630 RC = &AArch64::PPRRegClass;
4631 else if (RegVT.isScalableVector())
4632 RC = &AArch64::ZPRRegClass;
4633 else
4634 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4635
4636 // Transform the arguments in physical registers into virtual ones.
4637 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4638 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
4639
4640 // If this is an 8, 16 or 32-bit value, it is really passed promoted
4641 // to 64 bits. Insert an assert[sz]ext to capture this, then
4642 // truncate to the right size.
4643 switch (VA.getLocInfo()) {
4644 default:
4645 llvm_unreachable("Unknown loc info!");
4646 case CCValAssign::Full:
4647 break;
4649 assert(VA.getValVT().isScalableVector() &&
4650 "Only scalable vectors can be passed indirectly");
4651 break;
4652 case CCValAssign::BCvt:
4653 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
4654 break;
4655 case CCValAssign::AExt:
4656 case CCValAssign::SExt:
4657 case CCValAssign::ZExt:
4658 break;
// (Case label elided in this rendering.) Value stored in the upper 32
// bits of the location register: shift it down then zext/truncate.
4660 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
4661 DAG.getConstant(32, DL, RegVT));
4662 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
4663 break;
4664 }
4665 } else { // VA.isRegLoc()
4666 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
4667 unsigned ArgOffset = VA.getLocMemOffset();
4668 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
4669 ? VA.getLocVT().getSizeInBits()
4670 : VA.getValVT().getSizeInBits()) / 8;
4671
// On big-endian targets, small stack arguments live at the high end of
// their 8-byte slot.
4672 uint32_t BEAlign = 0;
4673 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
4674 !Ins[i].Flags.isInConsecutiveRegs())
4675 BEAlign = 8 - ArgSize;
4676
4677 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
4678
4679 // Create load nodes to retrieve arguments from the stack.
4681
4682 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
4684 MVT MemVT = VA.getValVT();
4685
// Choose the extension kind and memory type for the stack load.
4686 switch (VA.getLocInfo()) {
4687 default:
4688 break;
4689 case CCValAssign::Trunc:
4690 case CCValAssign::BCvt:
4691 MemVT = VA.getLocVT();
4692 break;
4694 assert(VA.getValVT().isScalableVector() &&
4695 "Only scalable vectors can be passed indirectly");
4696 MemVT = VA.getLocVT();
4697 break;
4698 case CCValAssign::SExt:
4699 ExtType = ISD::SEXTLOAD;
4700 break;
4701 case CCValAssign::ZExt:
4702 ExtType = ISD::ZEXTLOAD;
4703 break;
4704 case CCValAssign::AExt:
4705 ExtType = ISD::EXTLOAD;
4706 break;
4707 }
4708
4709 ArgValue = DAG.getExtLoad(
4710 ExtType, DL, VA.getLocVT(), Chain, FIN,
4712 MemVT);
4713
4714 }
4715
// Indirect arguments (scalable vectors, possibly tuples spanning several
// consecutive locations) are loaded through the pointer just produced.
4716 if (VA.getLocInfo() == CCValAssign::Indirect) {
4717 assert(VA.getValVT().isScalableVector() &&
4718 "Only scalable vectors can be passed indirectly");
4719
4720 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
4721 unsigned NumParts = 1;
4722 if (Ins[i].Flags.isInConsecutiveRegs()) {
4723 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
4724 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
4725 ++NumParts;
4726 }
4727
4728 MVT PartLoad = VA.getValVT();
4729 SDValue Ptr = ArgValue;
4730
4731 // Ensure we generate all loads for each tuple part, whilst updating the
4732 // pointer after each load correctly using vscale.
4733 while (NumParts > 0) {
4734 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
4735 InVals.push_back(ArgValue);
4736 NumParts--;
4737 if (NumParts > 0) {
4739 DL, Ptr.getValueType(),
4741 SDNodeFlags Flags;
4742 Flags.setNoUnsignedWrap(true);
4743 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
4744 BytesIncrement, Flags);
4745 ExtraArgLocs++;
4746 i++;
4747 }
4748 }
4749 } else {
// ILP32 pointers are zero-extended in their 64-bit registers.
4750 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
4751 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
4752 ArgValue, DAG.getValueType(MVT::i32));
4753 InVals.push_back(ArgValue);
4754 }
4755 }
4756 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
4757
4758 // varargs
4760 if (isVarArg) {
4761 if (!Subtarget->isTargetDarwin() || IsWin64) {
4762 // The AAPCS variadic function ABI is identical to the non-variadic
4763 // one. As a result there may be more arguments in registers and we should
4764 // save them for future reference.
4765 // Win64 variadic functions also pass arguments in registers, but all float
4766 // arguments are passed in integer registers.
4767 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
4768 }
4769
4770 // This will point to the next argument passed via stack.
4771 unsigned StackOffset = CCInfo.getNextStackOffset();
4772 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
4773 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
4774 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
4775
4776 if (MFI.hasMustTailInVarArgFunc()) {
4778 RegParmTypes.push_back(MVT::i64);
4779 RegParmTypes.push_back(MVT::f128);
4780 // Compute the set of forwarded registers. The rest are scratch.
4782 FuncInfo->getForwardedMustTailRegParms();
4783 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
4785
4786 // Conservatively forward X8, since it might be used for aggregate return.
4787 if (!CCInfo.isAllocated(AArch64::X8)) {
4788 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
4789 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
4790 }
4791 }
4792 }
4793
4794 // On Windows, InReg pointers must be returned, so record the pointer in a
4795 // virtual register at the start of the function so it can be returned in the
4796 // epilogue.
4797 if (IsWin64) {
4798 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4799 if (Ins[I].Flags.isInReg()) {
4800 assert(!FuncInfo->getSRetReturnReg());
4801
4802 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4803 Register Reg =
4805 FuncInfo->setSRetReturnReg(Reg);
4806
4807 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
4808 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
4809 break;
4810 }
4811 }
4812 }
4813
4814 unsigned StackArgSize = CCInfo.getNextStackOffset();
4816 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
4817 // This is a non-standard ABI so by fiat I say we're allowed to make full
4818 // use of the stack area to be popped, which must be aligned to 16 bytes in
4819 // any case:
4821
4822 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
4823 // a multiple of 16.
4825
4826 // This realignment carries over to the available bytes below. Our own
4827 // callers will guarantee the space is free by giving an aligned value to
4828 // CALLSEQ_START.
4829 }
4830 // Even if we're not expected to free up the space, it's useful to know how
4831 // much is there while considering tail calls (because we can reuse it).
4833
4834 if (Subtarget->hasCustomCallingConv())
4836
4837 return Chain;
4838}
4839
// Spill the unused variadic argument registers to the stack so va_arg can
// find them: X0-X7 are stored in 8-byte slots (in a fixed object below the
// frame on Win64, a plain stack object otherwise), and — when FP is
// available and not on Win64 — Q0-Q7 in 16-byte slots. Records the save
// area indices/sizes in the function info and appends the stores to Chain.
// NOTE(review): doxygen rendering — lines holding linked identifiers
// (FuncInfo, the MemOps SmallVector, FirstVariadicGPR/FPR initialisers,
// the FPR FIN declaration, one MachinePointerInfo argument, and the final
// TokenFactor joining MemOps into Chain) are elided here.
4840void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
4841 SelectionDAG &DAG,
4842 const SDLoc &DL,
4843 SDValue &Chain) const {
4845 MachineFrameInfo &MFI = MF.getFrameInfo();
4847 auto PtrVT = getPointerTy(DAG.getDataLayout());
4848 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
4849
4851
4852 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
4853 AArch64::X3, AArch64::X4, AArch64::X5,
4854 AArch64::X6, AArch64::X7 };
4855 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
4857
// Only the registers not consumed by named arguments need saving.
4858 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
4859 int GPRIdx = 0;
4860 if (GPRSaveSize != 0) {
4861 if (IsWin64) {
4862 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
4863 if (GPRSaveSize & 15)
4864 // The extra size here, if triggered, will always be 8.
4865 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
4866 } else
4867 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
4868
4869 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
4870
// Copy each remaining X register into the save area, 8 bytes apart.
4871 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
4872 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
4873 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
4874 SDValue Store = DAG.getStore(
4875 Val.getValue(1), DL, Val, FIN,
4876 IsWin64
4878 GPRIdx,
4879 (i - FirstVariadicGPR) * 8)
4880 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
4881 MemOps.push_back(Store);
4882 FIN =
4883 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
4884 }
4885 }
4886 FuncInfo->setVarArgsGPRIndex(GPRIdx);
4887 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
4888
// FP/SIMD variadic registers — skipped on Win64, where floats travel in
// integer registers.
4889 if (Subtarget->hasFPARMv8() && !IsWin64) {
4890 static const MCPhysReg FPRArgRegs[] = {
4891 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
4892 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
4893 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
4895
4896 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
4897 int FPRIdx = 0;
4898 if (FPRSaveSize != 0) {
4899 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
4900
4902
4903 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
4904 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
4905 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
4906
4907 SDValue Store = DAG.getStore(
4908 Val.getValue(1), DL, Val, FIN,
4910 MemOps.push_back(Store);
4911 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
4912 DAG.getConstant(16, DL, PtrVT));
4913 }
4914 }
4915 FuncInfo->setVarArgsFPRIndex(FPRIdx);
4916 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
4917 }
4918
// Join all save-area stores back into the chain (TokenFactor line elided
// in this rendering).
4919 if (!MemOps.empty()) {
4921 }
4922}
4923
4924/// LowerCallResult - Lower the result values of a call into the
4925/// appropriate copies out of appropriate physical registers.
// Analyses the call's return locations, copies each result register out
// exactly once (caching copies per physreg), applies the location-info
// conversion (bitcast / shift / zext-trunc), and appends the values to
// InVals. Returns the updated chain.
// NOTE(review): doxygen rendering — lines holding linked identifiers (the
// isThisReturn parameter, RetCC selection, the CopiedRegs map declaration,
// the VA loop-local, and the AExtUpper/FIRST-style case labels before the
// SRL sequence) are elided here.
4926SDValue AArch64TargetLowering::LowerCallResult(
4927 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
4928 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
4930 SDValue ThisVal) const {
4932 // Assign locations to each value returned by this call.
4935 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
4936 *DAG.getContext());
4937 CCInfo.AnalyzeCallResult(Ins, RetCC);
4938
4939 // Copy all of the result registers out of their specified physreg.
4940 for (unsigned i = 0; i != RVLocs.size(); ++i) {
4942
4943 // Pass 'this' value directly from the argument to return value, to avoid
4944 // reg unit interference
4945 if (i == 0 && isThisReturn) {
4946 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
4947 "unexpected return calling convention register assignment");
4948 InVals.push_back(ThisVal);
4949 continue;
4950 }
4951
4952 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
4953 // allows one use of a physreg per block.
4954 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
4955 if (!Val) {
4956 Val =
4957 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
4958 Chain = Val.getValue(1);
4959 InFlag = Val.getValue(2);
4960 CopiedRegs[VA.getLocReg()] = Val;
4961 }
4962
// Convert from the location type back to the value type.
4963 switch (VA.getLocInfo()) {
4964 default:
4965 llvm_unreachable("Unknown loc info!");
4966 case CCValAssign::Full:
4967 break;
4968 case CCValAssign::BCvt:
4969 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
4970 break;
// (Case label elided.) Value lives in the upper half of the register:
// shift down, then fall through to the zext/truncate below.
4972 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
4973 DAG.getConstant(32, DL, VA.getLocVT()));
4975 case CCValAssign::AExt:
4977 case CCValAssign::ZExt:
4978 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
4979 break;
4980 }
4981
4982 InVals.push_back(Val);
4983 }
4984
4985 return Chain;
4986}
4987
4988/// Return true if the calling convention is one that we can guarantee TCO for.
// Only fastcc gets a TCO guarantee. (The signature line — taking a
// CallingConv::ID CC — is elided in this rendering.)
4990 return CC == CallingConv::Fast;
4991}
4992
4993/// Return true if we might ever do TCO for calls with this calling convention.
// C and Swift (plus case labels elided in this rendering) may be
// tail-call-optimized opportunistically; everything else only when
// canGuaranteeTCO says the convention guarantees it. (The signature line
// taking a CallingConv::ID CC is also elided.)
4995 switch (CC) {
4996 case CallingConv::C:
4999 case CallingConv::Swift:
5000 return true;
5001 default:
5002 return canGuaranteeTCO(CC);
5003 }
5004}
5005
// Decide whether the call described by (Callee, CalleeCC, Outs/OutVals, Ins)
// may be lowered as a tail call from the current function. Returns false as
// soon as any ABI or register-preservation constraint is violated; returns
// true only after all checks below pass.
// NOTE(review): this listing is an extracted snippet — several original lines
// (e.g. 5008, 5011, 5014, 5023-5024, 5032, 5054, 5064, 5066, 5089, 5099,
// 5120, 5145) are elided by the extractor, so some statements below appear
// truncated.
5006bool AArch64TargetLowering::isEligibleForTailCallOptimization(
5007 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
5009 const SmallVectorImpl<SDValue> &OutVals,
5010 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5012 return false;
5013
5015 const Function &CallerF = MF.getFunction();
5016 CallingConv::ID CallerCC = CallerF.getCallingConv();
5017
5018 // If this function uses the C calling convention but has an SVE signature,
5019 // then it preserves more registers and should assume the SVE_VectorCall CC.
5020 // The check for matching callee-saved regs will determine whether it is
5021 // eligible for TCO.
5022 if (CallerCC == CallingConv::C &&
5025
5026 bool CCMatch = CallerCC == CalleeCC;
5027
5028 // When using the Windows calling convention on a non-windows OS, we want
5029 // to back up and restore X18 in such functions; we can't do a tail call
5030 // from those functions.
5031 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
5033 return false;
5034
5035 // Byval parameters hand the function a pointer directly into the stack area
5036 // we want to reuse during a tail call. Working around this *is* possible (see
5037 // X86) but less efficient and uglier in LowerCall.
5038 for (Function::const_arg_iterator i = CallerF.arg_begin(),
5039 e = CallerF.arg_end();
5040 i != e; ++i) {
5041 if (i->hasByValAttr())
5042 return false;
5043
5044 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
5045 // In this case, it is necessary to save/restore X0 in the callee. Tail
5046 // call opt interferes with this. So we disable tail call opt when the
5047 // caller has an argument with "inreg" attribute.
5048
5049 // FIXME: Check whether the callee also has an "inreg" argument.
5050 if (i->hasInRegAttr())
5051 return false;
5052 }
5053
5055 return canGuaranteeTCO(CalleeCC) && CCMatch;
5056
5057 // Externally-defined functions with weak linkage should not be
5058 // tail-called on AArch64 when the OS does not support dynamic
5059 // pre-emption of symbols, as the AAELF spec requires normal calls
5060 // to undefined weak functions to be replaced with a NOP or jump to the
5061 // next instruction. The behaviour of branch instructions in this
5062 // situation (as used for tail calls) is implementation-defined, so we
5063 // cannot rely on the linker replacing the tail call with a return.
5065 const GlobalValue *GV = G->getGlobal();
5067 if (GV->hasExternalWeakLinkage() &&
5068 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
5069 return false;
5070 }
5071
5072 // Now we search for cases where we can use a tail call without changing the
5073 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
5074 // concept.
5075
5076 // I want anyone implementing a new calling convention to think long and hard
5077 // about this assert.
5078 assert((!isVarArg || CalleeCC == CallingConv::C) &&
5079 "Unexpected variadic calling convention");
5080
5081 LLVMContext &C = *DAG.getContext();
5082 if (isVarArg && !Outs.empty()) {
5083 // At least two cases here: if caller is fastcc then we can't have any
5084 // memory arguments (we'd be expected to clean up the stack afterwards). If
5085 // caller is C then we could potentially use its argument area.
5086
5087 // FIXME: for now we take the most conservative of these in both cases:
5088 // disallow all variadic memory operands.
5090 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5091
5092 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
5093 for (const CCValAssign &ArgLoc : ArgLocs)
5094 if (!ArgLoc.isRegLoc())
5095 return false;
5096 }
5097
5098 // Check that the call results are passed in the same way.
5100 CCAssignFnForCall(CalleeCC, isVarArg),
5101 CCAssignFnForCall(CallerCC, isVarArg)))
5102 return false;
5103 // The callee has to preserve all registers the caller needs to preserve.
5104 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5105 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5106 if (!CCMatch) {
5107 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5108 if (Subtarget->hasCustomCallingConv()) {
5109 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5110 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5111 }
5112 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5113 return false;
5114 }
5115
5116 // Nothing more to check if the callee is taking no arguments
5117 if (Outs.empty())
5118 return true;
5119
5121 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5122
5123 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5124
5125 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5126
5127 // If any of the arguments is passed indirectly, it must be SVE, so the
5128 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
5129 // allocate space on the stack. That is why we determine this explicitly here:
5130 // if so, the call cannot be a tailcall.
5131 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5132 assert((A.getLocInfo() != CCValAssign::Indirect ||
5133 A.getValVT().isScalableVector()) &&
5134 "Expected value to be scalable");
5135 return A.getLocInfo() == CCValAssign::Indirect;
5136 }))
5137 return false;
5138
5139 // If the stack arguments for this call do not fit into our own save area then
5140 // the call cannot be made tail.
5141 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5142 return false;
5143
5144 const MachineRegisterInfo &MRI = MF.getRegInfo();
5146 return false;
5147
5148 return true;
5149}
5150
// Build a TokenFactor that orders Chain after every load from a fixed stack
// object (negative frame index) whose byte range overlaps the frame index
// 'ClobberedFI'. Used when storing a tail-call argument over the caller's
// incoming argument area: any load still reading that area must happen first,
// or it would observe the clobbered bytes.
// NOTE(review): extracted snippet — original lines 5155 (ArgChains
// declaration) and 5165 (start of the use-list for loop) are elided here.
5151SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5152 SelectionDAG &DAG,
5153 MachineFrameInfo &MFI,
5154 int ClobberedFI) const {
5156 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5157 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5158
5159 // Include the original chain at the beginning of the list. When this is
5160 // used by target LowerCall hooks, this helps legalize find the
5161 // CALLSEQ_BEGIN node.
5162 ArgChains.push_back(Chain);
5163
5164 // Add a chain value for each stack argument corresponding
5166 UE = DAG.getEntryNode().getNode()->use_end();
5167 U != UE; ++U)
5168 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5169 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5170 if (FI->getIndex() < 0) {
// Compute the byte interval covered by this incoming-argument load and
// keep its chain only if the interval intersects [FirstByte, LastByte].
5171 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5172 int64_t InLastByte = InFirstByte;
5173 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5174
5175 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5176 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5177 ArgChains.push_back(SDValue(L, 1));
5178 }
5179
5180 // Build a tokenfactor for all the chains.
5181 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
5182}
5183
// Returns whether the callee is responsible for popping its stack arguments.
// NOTE(review): the body (original line 5186) is elided in this extraction;
// presumably it tests CallCC/TailCallOpt against canGuaranteeTCO — confirm
// against the full source.
5184bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
5185 bool TailCallOpt) const {
5187}
5188
5189/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
5190/// and add input and output parameter nodes.
///
/// Handles tail/sibling-call detection, SVE calling-convention promotion,
/// vararg vs. fixed operand analysis, register and stack argument placement
/// (including indirect SVE spills and byval memcpy), callee legalization to
/// target global/symbol nodes, the call-preserved register mask, and finally
/// either a TC_RETURN or a CALL node plus result copy-out.
///
/// NOTE(review): extracted snippet — a number of original lines are elided
/// (e.g. 5198, 5205-5206, 5209-5210, 5223-5224, 5245, 5278, 5338, 5340,
/// 5342, 5356, 5374-5376, 5380, 5395, 5409, 5413, 5416, 5418, 5427, 5429,
/// 5460, 5469, 5482-5483, 5502, 5511, 5518, 5521, 5526, 5550, 5554, 5567,
/// 5569-5570, 5576, 5579-5580, 5639, 5644-5645, 5656, 5669), so several
/// statements below appear truncated.
5191SDValue
5192AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
5193 SmallVectorImpl<SDValue> &InVals) const {
5194 SelectionDAG &DAG = CLI.DAG;
5195 SDLoc &DL = CLI.DL;
5196 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
5197 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
5199 SDValue Chain = CLI.Chain;
5200 SDValue Callee = CLI.Callee;
5201 bool &IsTailCall = CLI.IsTailCall;
5202 CallingConv::ID CallConv = CLI.CallConv;
5203 bool IsVarArg = CLI.IsVarArg;
5204
5207 bool IsThisReturn = false;
5208
5211 bool IsSibCall = false;
5212
5213 // Check callee args/returns for SVE registers and set calling convention
5214 // accordingly.
5215 if (CallConv == CallingConv::C) {
5216 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
5217 return Out.VT.isScalableVector();
5218 });
5219 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
5220 return In.VT.isScalableVector();
5221 });
5222
5225 }
5226
5227 if (IsTailCall) {
5228 // Check if it's really possible to do a tail call.
5229 IsTailCall = isEligibleForTailCallOptimization(
5230 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
5231 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
5232 report_fatal_error("failed to perform tail call elimination on a call "
5233 "site marked musttail");
5234
5235 // A sibling call is one where we're under the usual C ABI and not planning
5236 // to change that but can still do a tail call:
5237 if (!TailCallOpt && IsTailCall)
5238 IsSibCall = true;
5239
5240 if (IsTailCall)
5241 ++NumTailCalls;
5242 }
5243
5244 // Analyze operands of the call, assigning locations to each operand.
5246 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
5247 *DAG.getContext());
5248
5249 if (IsVarArg) {
5250 // Handle fixed and variable vector arguments differently.
5251 // Variable vector arguments always go into memory.
5252 unsigned NumArgs = Outs.size();
5253
5254 for (unsigned i = 0; i != NumArgs; ++i) {
5255 MVT ArgVT = Outs[i].VT;
5256 if (!Outs[i].IsFixed && ArgVT.isScalableVector())
5257 report_fatal_error("Passing SVE types to variadic functions is "
5258 "currently not supported");
5259
5260 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5261 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
5262 /*IsVarArg=*/ !Outs[i].IsFixed);
5263 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
5264 assert(!Res && "Call operand has unhandled type");
5265 (void)Res;
5266 }
5267 } else {
5268 // At this point, Outs[].VT may already be promoted to i32. To correctly
5269 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5270 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5271 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
5272 // we use a special version of AnalyzeCallOperands to pass in ValVT and
5273 // LocVT.
5274 unsigned NumArgs = Outs.size();
5275 for (unsigned i = 0; i != NumArgs; ++i) {
5276 MVT ValVT = Outs[i].VT;
5277 // Get type of the original argument.
5279 CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
5280 /*AllowUnknown*/ true);
5281 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
5282 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5283 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5284 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5285 ValVT = MVT::i8;
5286 else if (ActualMVT == MVT::i16)
5287 ValVT = MVT::i16;
5288
5289 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
5290 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
5291 assert(!Res && "Call operand has unhandled type");
5292 (void)Res;
5293 }
5294 }
5295
5296 // Get a count of how many bytes are to be pushed on the stack.
5297 unsigned NumBytes = CCInfo.getNextStackOffset();
5298
5299 if (IsSibCall) {
5300 // Since we're not changing the ABI to make this a tail call, the memory
5301 // operands are already available in the caller's incoming argument space.
5302 NumBytes = 0;
5303 }
5304
5305 // FPDiff is the byte offset of the call's argument area from the callee's.
5306 // Stores to callee stack arguments will be placed in FixedStackSlots offset
5307 // by this amount for a tail call. In a sibling call it must be 0 because the
5308 // caller will deallocate the entire stack and the callee still expects its
5309 // arguments to begin at SP+0. Completely unused for non-tail calls.
5310 int FPDiff = 0;
5311
5312 if (IsTailCall && !IsSibCall) {
5313 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
5314
5315 // Since callee will pop argument stack as a tail call, we must keep the
5316 // popped size 16-byte aligned.
5317 NumBytes = alignTo(NumBytes, 16);
5318
5319 // FPDiff will be negative if this tail call requires more space than we
5320 // would automatically have in our incoming argument space. Positive if we
5321 // can actually shrink the stack.
5322 FPDiff = NumReusableBytes - NumBytes;
5323
5324 // The stack pointer must be 16-byte aligned at all times it's used for a
5325 // memory operation, which in practice means at *all* times and in
5326 // particular across call boundaries. Therefore our own arguments started at
5327 // a 16-byte aligned SP and the delta applied for the tail call should
5328 // satisfy the same constraint.
5329 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
5330 }
5331
5332 // Adjust the stack pointer for the new arguments...
5333 // These operations are automatically eliminated by the prolog/epilog pass
5334 if (!IsSibCall)
5335 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
5336
5337 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
5339
5341 SmallSet<unsigned, 8> RegsUsed;
5343 auto PtrVT = getPointerTy(DAG.getDataLayout());
5344
5345 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
// musttail varargs: re-forward the registers saved on function entry.
5346 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
5347 for (const auto &F : Forwards) {
5348 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
5349 RegsToPass.emplace_back(F.PReg, Val);
5350 }
5351 }
5352
5353 // Walk the register/memloc assignments, inserting copies/loads.
5354 unsigned ExtraArgLocs = 0;
5355 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
5357 SDValue Arg = OutVals[i];
5358 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5359
5360 // Promote the value if needed.
5361 switch (VA.getLocInfo()) {
5362 default:
5363 llvm_unreachable("Unknown loc info!");
5364 case CCValAssign::Full:
5365 break;
5366 case CCValAssign::SExt:
5367 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
5368 break;
5369 case CCValAssign::ZExt:
5370 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5371 break;
5372 case CCValAssign::AExt:
5373 if (Outs[i].ArgVT == MVT::i1) {
5374 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
5377 }
5378 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5379 break;
5381 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5382 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5383 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5384 DAG.getConstant(32, DL, VA.getLocVT()));
5385 break;
5386 case CCValAssign::BCvt:
5387 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
5388 break;
5389 case CCValAssign::Trunc:
5390 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5391 break;
5392 case CCValAssign::FPExt:
5393 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
5394 break;
// Indirect (elided case label, original line 5395): spill the scalable
// value(s) to a stack slot and pass the slot address instead.
5396 assert(VA.getValVT().isScalableVector() &&
5397 "Only scalable vectors can be passed indirectly");
5398
5399 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
5400 uint64_t PartSize = StoreSize;
5401 unsigned NumParts = 1;
5402 if (Outs[i].Flags.isInConsecutiveRegs()) {
5403 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
5404 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5405 ++NumParts;
5406 StoreSize *= NumParts;
5407 }
5408
5410 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
5411 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
5412 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
5414
5415 MachinePointerInfo MPI =
5417 SDValue Ptr = DAG.getFrameIndex(
5419 SDValue SpillSlot = Ptr;
5420
5421 // Ensure we generate all stores for each tuple part, whilst updating the
5422 // pointer after each store correctly using vscale.
5423 while (NumParts) {
5424 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
5425 NumParts--;
5426 if (NumParts > 0) {
5428 DL, Ptr.getValueType(),
5430 SDNodeFlags Flags;
5431 Flags.setNoUnsignedWrap(true);
5432
5433 MPI = MachinePointerInfo(MPI.getAddrSpace());
5434 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5435 BytesIncrement, Flags);
5436 ExtraArgLocs++;
5437 i++;
5438 }
5439 }
5440
5441 Arg = SpillSlot;
5442 break;
5443 }
5444
5445 if (VA.isRegLoc()) {
5446 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
5447 Outs[0].VT == MVT::i64) {
5448 assert(VA.getLocVT() == MVT::i64 &&
5449 "unexpected calling convention register assignment");
5450 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
5451 "unexpected use of 'returned'");
5452 IsThisReturn = true;
5453 }
5454 if (RegsUsed.count(VA.getLocReg())) {
5455 // If this register has already been used then we're trying to pack
5456 // parts of an [N x i32] into an X-register. The extension type will
5457 // take care of putting the two halves in the right place but we have to
5458 // combine them.
5459 SDValue &Bits =
5461 [=](const std::pair<unsigned, SDValue> &Elt) {
5462 return Elt.first == VA.getLocReg();
5463 })
5464 ->second;
5465 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
5466 // Call site info is used for function's parameter entry value
5467 // tracking. For now we track only simple cases when parameter
5468 // is transferred through whole register.
5470 return ArgReg.Reg == VA.getLocReg();
5471 });
5472 } else {
5473 RegsToPass.emplace_back(VA.getLocReg(), Arg);
5474 RegsUsed.insert(VA.getLocReg());
5475 const TargetOptions &Options = DAG.getTarget().Options;
5476 if (Options.EmitCallSiteInfo)
5477 CSInfo.emplace_back(VA.getLocReg(), i);
5478 }
5479 } else {
5480 assert(VA.isMemLoc());
5481
5484
5485 // FIXME: This works on big-endian for composite byvals, which are the
5486 // common case. It should also work for fundamental types too.
5487 uint32_t BEAlign = 0;
5488 unsigned OpSize;
5489 if (VA.getLocInfo() == CCValAssign::Indirect)
5490 OpSize = VA.getLocVT().getFixedSizeInBits();
5491 else
5492 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
5493 : VA.getValVT().getSizeInBits();
5494 OpSize = (OpSize + 7) / 8;
5495 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
5496 !Flags.isInConsecutiveRegs()) {
5497 if (OpSize < 8)
5498 BEAlign = 8 - OpSize;
5499 }
5500 unsigned LocMemOffset = VA.getLocMemOffset();
5501 int32_t Offset = LocMemOffset + BEAlign;
5503 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
5504
5505 if (IsTailCall) {
5506 Offset = Offset + FPDiff;
5507 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
5508
5509 DstAddr = DAG.getFrameIndex(FI, PtrVT);
5510 DstInfo =
5512
5513 // Make sure any stack arguments overlapping with where we're storing
5514 // are loaded before this eventual operation. Otherwise they'll be
5515 // clobbered.
5516 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
5517 } else {
5519
5520 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
5522 LocMemOffset);
5523 }
5524
5525 if (Outs[i].Flags.isByVal()) {
5527 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
5528 SDValue Cpy = DAG.getMemcpy(
5529 Chain, DL, DstAddr, Arg, SizeNode,
5530 Outs[i].Flags.getNonZeroByValAlign(),
5531 /*isVol = */ false, /*AlwaysInline = */ false,
5532 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
5533
5534 MemOpChains.push_back(Cpy);
5535 } else {
5536 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
5537 // promoted to a legal register type i32, we should truncate Arg back to
5538 // i1/i8/i16.
5539 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
5540 VA.getValVT() == MVT::i16)
5541 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
5542
5543 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
5544 MemOpChains.push_back(Store);
5545 }
5546 }
5547 }
5548
5549 if (!MemOpChains.empty())
5551
5552 // Build a sequence of copy-to-reg nodes chained together with token chain
5553 // and flag operands which copy the outgoing args into the appropriate regs.
5555 for (auto &RegToPass : RegsToPass) {
5556 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
5557 RegToPass.second, InFlag);
5558 InFlag = Chain.getValue(1);
5559 }
5560
5561 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
5562 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
5563 // node so that legalize doesn't hack it.
5564 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5565 auto GV = G->getGlobal();
5566 unsigned OpFlags =
5568 if (OpFlags & AArch64II::MO_GOT) {
5571 } else {
5572 const GlobalValue *GV = G->getGlobal();
5573 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
5574 }
5575 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
5577 Subtarget->isTargetMachO()) {
5578 const char *Sym = S->getSymbol();
5581 } else {
5582 const char *Sym = S->getSymbol();
5583 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
5584 }
5585 }
5586
5587 // We don't usually want to end the call-sequence here because we would tidy
5588 // the frame up *after* the call, however in the ABI-changing tail-call case
5589 // we've carefully laid out the parameters so that when sp is reset they'll be
5590 // in the correct location.
5591 if (IsTailCall && !IsSibCall) {
5592 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
5593 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
5594 InFlag = Chain.getValue(1);
5595 }
5596
5597 std::vector<SDValue> Ops;
5598 Ops.push_back(Chain);
5599 Ops.push_back(Callee);
5600
5601 if (IsTailCall) {
5602 // Each tail call may have to adjust the stack by a different amount, so
5603 // this information must travel along with the operation for eventual
5604 // consumption by emitEpilogue.
5605 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
5606 }
5607
5608 // Add argument registers to the end of the list so that they are known live
5609 // into the call.
5610 for (auto &RegToPass : RegsToPass)
5611 Ops.push_back(DAG.getRegister(RegToPass.first,
5612 RegToPass.second.getValueType()));
5613
5614 // Add a register mask operand representing the call-preserved registers.
5615 const uint32_t *Mask;
5616 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5617 if (IsThisReturn) {
5618 // For 'this' returns, use the X0-preserving mask if applicable
5619 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
5620 if (!Mask) {
5621 IsThisReturn = false;
5622 Mask = TRI->getCallPreservedMask(MF, CallConv);
5623 }
5624 } else
5625 Mask = TRI->getCallPreservedMask(MF, CallConv);
5626
5627 if (Subtarget->hasCustomCallingConv())
5628 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
5629
5630 if (TRI->isAnyArgRegReserved(MF))
5631 TRI->emitReservedArgRegCallError(MF);
5632
5633 assert(Mask && "Missing call preserved mask for calling convention");
5634 Ops.push_back(DAG.getRegisterMask(Mask));
5635
5636 if (InFlag.getNode())
5637 Ops.push_back(InFlag);
5638
5640
5641 // If we're doing a tail call, use a TC_RETURN here rather than an
5642 // actual call instruction.
5643 if (IsTailCall) {
5646 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
5647 return Ret;
5648 }
5649
5650 unsigned CallOpc = AArch64ISD::CALL;
5651 // Calls marked with "rv_marker" are special. They should be expanded to the
5652 // call, directly followed by a special marker sequence. Use the CALL_RVMARKER
5653 // to do that.
5654 if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) {
5655 assert(!IsTailCall && "tail calls cannot be marked with rv_marker");
5657 }
5658
5659 // Returns a chain and a flag for retval copy to use.
5660 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
5661 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5662 InFlag = Chain.getValue(1);
5663 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5664
5665 uint64_t CalleePopBytes =
5666 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
5667
5668 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
5670 InFlag, DL);
5671 if (!Ins.empty())
5672 InFlag = Chain.getValue(1);
5673
5674 // Handle result values, copying them out of physregs into vregs that we
5675 // return.
5676 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
5677 InVals, IsThisReturn,
5678 IsThisReturn ? OutVals[0] : SDValue());
5679}
5680
// Returns true if the given return values can all be lowered under CallConv
// (checked via CCState::CheckReturn against the return CC assignment
// function). NOTE(review): extracted snippet — original lines 5684-5685
// (RetCC selection and the RVLocs vector declaration) are elided here.
5681bool AArch64TargetLowering::CanLowerReturn(
5682 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
5683 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
5686 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
5687 return CCInfo.CheckReturn(Outs, RetCC);
5688}
5689
// Lower a function return: analyze return-value locations, promote each value
// to its location type, copy values into the physical return registers
// (OR-combining halves packed into one register), handle the Windows sret
// copy into X0, append callee-saved-via-copy registers, and emit the final
// return node.
// NOTE(review): extracted snippet — original lines 5693, 5699-5700, 5707,
// 5723, 5734, 5754, 5768, 5775, 5798 are elided, so a few statements below
// appear truncated.
5690SDValue
5691AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
5692 bool isVarArg,
5694 const SmallVectorImpl<SDValue> &OutVals,
5695 const SDLoc &DL, SelectionDAG &DAG) const {
5696 auto &MF = DAG.getMachineFunction();
5697 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5698
5701 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5702 *DAG.getContext());
5703 CCInfo.AnalyzeReturn(Outs, RetCC);
5704
5705 // Copy the result values into the output registers.
5706 SDValue Flag;
5708 SmallSet<unsigned, 4> RegsUsed;
5709 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
5710 ++i, ++realRVLocIdx) {
5711 CCValAssign &VA = RVLocs[i];
5712 assert(VA.isRegLoc() && "Can only return in registers!");
5713 SDValue Arg = OutVals[realRVLocIdx];
5714
5715 switch (VA.getLocInfo()) {
5716 default:
5717 llvm_unreachable("Unknown loc info!");
5718 case CCValAssign::Full:
5719 if (Outs[i].ArgVT == MVT::i1) {
5720 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
5721 // value. This is strictly redundant on Darwin (which uses "zeroext
5722 // i1"), but will be optimised out before ISel.
5724 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5725 }
5726 break;
5727 case CCValAssign::BCvt:
5728 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
5729 break;
5730 case CCValAssign::AExt:
5731 case CCValAssign::ZExt:
5732 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5733 break;
// (elided case label, original line 5734): high 32 bits of a 64-bit reg —
// zero/trunc then shift the value into the upper half.
5735 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5736 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5737 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5738 DAG.getConstant(32, DL, VA.getLocVT()));
5739 break;
5740 }
5741
// Two values assigned to the same register (packed [N x i32]) are merged
// with OR; otherwise record a fresh register/value pair.
5742 if (RegsUsed.count(VA.getLocReg())) {
5743 SDValue &Bits =
5744 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
5745 return Elt.first == VA.getLocReg();
5746 })->second;
5747 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
5748 } else {
5749 RetVals.emplace_back(VA.getLocReg(), Arg);
5750 RegsUsed.insert(VA.getLocReg());
5751 }
5752 }
5753
5755 for (auto &RetVal : RetVals) {
5756 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
5757 Flag = Chain.getValue(1);
5758 RetOps.push_back(
5759 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
5760 }
5761
5762 // Windows AArch64 ABIs require that for returning structs by value we copy
5763 // the sret argument into X0 for the return.
5764 // We saved the argument into a virtual register in the entry block,
5765 // so now we copy the value out and into X0.
5766 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
5767 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
5769
5770 unsigned RetValReg = AArch64::X0;
5771 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
5772 Flag = Chain.getValue(1);
5773
5774 RetOps.push_back(
5776 }
5777
5778 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5779 const MCPhysReg *I =
5780 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
5781 if (I) {
5782 for (; *I; ++I) {
5783 if (AArch64::GPR64RegClass.contains(*I))
5784 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
5785 else if (AArch64::FPR64RegClass.contains(*I))
5786 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
5787 else
5788 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
5789 }
5790 }
5791
5792 RetOps[0] = Chain; // Update chain.
5793
5794 // Add the flag if we have it.
5795 if (Flag.getNode())
5796 RetOps.push_back(Flag);
5797
5799}
5800
5801//===----------------------------------------------------------------------===//
5802// Other Lowering Code
5803//===----------------------------------------------------------------------===//
5804
// Wrap a GlobalAddressSDNode as a target-specific global address node,
// preserving the node's offset and applying the given target flag.
5805SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
5806 SelectionDAG &DAG,
5807 unsigned Flag) const {
5808 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
5809 N->getOffset(), Flag);
5810}
5811
// Wrap a JumpTableSDNode as a target jump-table node with the given flag.
5812SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
5813 SelectionDAG &DAG,
5814 unsigned Flag) const {
5815 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
5816}
5817
// Wrap a ConstantPoolSDNode as a target constant-pool node, preserving its
// alignment and offset and applying the given target flag.
5818SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
5819 SelectionDAG &DAG,
5820 unsigned Flag) const {
5821 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
5822 N->getOffset(), Flag);
5823}
5824
// Wrap a BlockAddressSDNode as a target block-address node (offset 0) with
// the given target flag.
5825SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
5826 SelectionDAG &DAG,
5827 unsigned Flag) const {
5828 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
5829}
5830
5831// (loadGOT sym)
// Materialize the address of N via a GOT load: build the target node with
// MO_GOT set and wrap it in an AArch64ISD::LOADgot node.
5832template <class NodeTy>
5833SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
5834 unsigned Flags) const {
5835 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
5836 SDLoc DL(N);
5837 EVT Ty = getPointerTy(DAG.getDataLayout());
5838 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
5839 // FIXME: Once remat is capable of dealing with instructions with register
5840 // operands, expand this into two nodes instead of using a wrapper node.
5841 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
5842}
5843
5844// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
// Large-code-model address: materialize N from four 16-bit MOVZ/MOVK pieces
// (G3..G0, the lower three non-checking) under a wrapper node.
// NOTE(review): extracted snippet — original line 5853 (the wrapper opcode
// and first getNode arguments) is elided here.
5845template <class NodeTy>
5846SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
5847 unsigned Flags) const {
5848 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
5849 SDLoc DL(N);
5850 EVT Ty = getPointerTy(DAG.getDataLayout());
5851 const unsigned char MO_NC = AArch64II::MO_NC;
5852 return DAG.getNode(
5854 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
5855 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
5856 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
5857 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
5858}
5859
5860// (addlow (adrp %hi(sym)) %lo(sym))
// Small-code-model address: ADRP of the page (MO_PAGE) followed by an
// ADDlow of the low 12 bits.
// NOTE(review): extracted snippet — original line 5869 (the low-part target
// flags, presumably MO_PAGEOFF | MO_NC) is elided here.
5861template <class NodeTy>
5862SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
5863 unsigned Flags) const {
5864 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
5865 SDLoc DL(N);
5866 EVT Ty = getPointerTy(DAG.getDataLayout());
5867 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
5868 SDValue Lo = getTargetNode(N, Ty, DAG,
5870 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
5871 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
5872}
5873
5874// (adr sym)
// Tiny-code-model address: a single ADR reaches the symbol directly.
5875template <class NodeTy>
5876SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
5877 unsigned Flags) const {
5878 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
5879 SDLoc DL(N);
5880 EVT Ty = getPointerTy(DAG.getDataLayout());
5881 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
5882 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
5883}
5884
// Lower a GlobalAddress node by classifying the reference and dispatching to
// the GOT / large / tiny / small addressing helpers; a trailing load handles
// the indirect case.
// NOTE(review): extracted snippet — original lines 5887, 5891-5892,
// 5901-5902, 5909, 5911, 5913 are elided, so the GN cast, the offset assert
// condition, and the code-model / indirection guards appear truncated.
5885SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
5886 SelectionDAG &DAG) const {
5888 const GlobalValue *GV = GN->getGlobal();
5889 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5890
5893 "unexpected offset in global node");
5894
5895 // This also catches the large code model case for Darwin, and tiny code
5896 // model with got relocations.
5897 if ((OpFlags & AArch64II::MO_GOT) != 0) {
5898 return getGOT(GN, DAG, OpFlags);
5899 }
5900
5903 Result = getAddrLarge(GN, DAG, OpFlags);
5904 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
5905 Result = getAddrTiny(GN, DAG, OpFlags);
5906 } else {
5907 Result = getAddr(GN, DAG, OpFlags);
5908 }
5910 SDLoc DL(GN);
5912 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
5914 return Result;
5915}
5916
5917/// Convert a TLS address reference into the correct sequence of loads
5918/// and calls to compute the variable's address (for Darwin, currently) and
5919/// return an SDValue containing the final node.
5920
5921/// Darwin only has one TLS scheme which must be capable of dealing with the
5922/// fully general situation, in the worst case. This means:
5923/// + "extern __thread" declaration.
5924/// + Defined in a possibly unknown dynamic library.
5925///
5926/// The general system is that each __thread variable has a [3 x i64] descriptor
5927/// which contains information used by the runtime to calculate the address. The
5928/// only part of this the compiler needs to know about is the first xword, which
5929/// contains a function pointer that must be called with the address of the
5930/// entire descriptor in "x0".
5931///
5932/// Since this descriptor may be in a different unit, in general even the
5933/// descriptor must be accessed via an indirect load. The "ideal" code sequence
5934/// is:
5935/// adrp x0, _var@TLVPPAGE
5936/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
5937/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
5938/// ; the function pointer
5939/// blr x1 ; Uses descriptor address in x0
5940/// ; Address of _var is now in x0.
5941///
5942/// If the address of _var's descriptor *is* known to the linker, then it can
5943/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
5944/// a slight efficiency gain.
// Lower a TLS global address for Darwin: load the thread-local descriptor's
// function pointer, then emit the special TLS call (see the scheme described
// in the comment block above this function) with X0 carrying the descriptor
// address; the variable's address comes back in X0.
// NOTE(review): extracted snippet — original lines 5952-5953, 5956-5958,
// 5963, 5965, 5967, 5970-5973, 5989 are elided, so the PtrVT/PtrMemVT setup,
// DescAddr computation, load operands, and call-node opcode appear truncated.
5945SDValue
5946AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
5947 SelectionDAG &DAG) const {
5948 assert(Subtarget->isTargetDarwin() &&
5949 "This function expects a Darwin target");
5950
5951 SDLoc DL(Op);
5954 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5955
5959
5960 // The first entry in the descriptor is a function pointer that we must call
5961 // to obtain the address of the variable.
5962 SDValue Chain = DAG.getEntryNode();
5964 PtrMemVT, DL, Chain, DescAddr,
5966 Align(PtrMemVT.getSizeInBits() / 8),
5968 Chain = FuncTLVGet.getValue(1);
5969
5970 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
5972
5974 MFI.setAdjustsStack(true);
5975
5976 // TLS calls preserve all registers except those that absolutely must be
5977 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
5978 // silly).
5979 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5980 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
5981 if (Subtarget->hasCustomCallingConv())
5982 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
5983
5984 // Finally, we can make the call. This is just a degenerate version of a
5985 // normal AArch64 call node: x0 takes the address of the descriptor, and
5986 // returns the address of the variable in this thread.
5987 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
5988 Chain =
5990 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
5991 DAG.getRegisterMask(Mask), Chain.getValue(1));
5992 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
5993}
5994
5995/// Convert a thread-local variable reference into a sequence of instructions to
5996/// compute the variable's address for the local exec TLS model of ELF targets.
5997/// The sequence depends on the maximum TLS area size.
5998SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
6000                                                    const SDLoc &DL,
6001                                                    SelectionDAG &DAG) const {
6004
// Select the :tprel: relocation sequence by the configured maximum TLS area
// size (-tls-size); larger areas need wider offsets and thus more
// instructions. Cases fall out as 12/24 bits → ADD immediates, 32/48 bits →
// MOVZ/MOVK materialization. Var/HiVar/LoVar/Addr/TPOff are declared on
// lines not visible in this view.
6005  switch (DAG.getTarget().Options.TLSSize) {
6006  default:
6007    llvm_unreachable("Unexpected TLS size");
6008
6009  case 12: {
6010    // mrs x0, TPIDR_EL0
6011    // add x0, x0, :tprel_lo12:a
6014    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6015                                      Var,
6016                                      DAG.getTargetConstant(0, DL, MVT::i32)),
6017                   0);
6018  }
6019
6020  case 24: {
6021    // mrs x0, TPIDR_EL0
6022    // add x0, x0, :tprel_hi12:a
6023    // add x0, x0, :tprel_lo12_nc:a
6027        GV, DL, PtrVT, 0,
6029    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6030                                      HiVar,
6031                                      DAG.getTargetConstant(0, DL, MVT::i32)),
6032                   0);
6033    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
6034                                      LoVar,
6035                                      DAG.getTargetConstant(0, DL, MVT::i32)),
6036                   0);
6037  }
6038
6039  case 32: {
6040    // mrs x1, TPIDR_EL0
6041    // movz x0, #:tprel_g1:a
6042    // movk x0, #:tprel_g0_nc:a
6043    // add x0, x1, x0
6047        GV, DL, PtrVT, 0,
// The shift amounts (16, then 0) place each 16-bit :tprel_g*: chunk into the
// correct half-word of the offset register.
6049    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6050                                       DAG.getTargetConstant(16, DL, MVT::i32)),
6051                    0);
6052    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6053                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6054                    0);
6055    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6056  }
6057
6058  case 48: {
6059    // mrs x1, TPIDR_EL0
6060    // movz x0, #:tprel_g2:a
6061    // movk x0, #:tprel_g1_nc:a
6062    // movk x0, #:tprel_g0_nc:a
6063    // add x0, x1, x0
6067        GV, DL, PtrVT, 0,
6070        GV, DL, PtrVT, 0,
6072    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6073                                       DAG.getTargetConstant(32, DL, MVT::i32)),
6074                    0);
6075    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
6076                                       DAG.getTargetConstant(16, DL, MVT::i32)),
6077                    0);
6078    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6079                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6080                    0);
6081    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6082  }
6083  }
6084}
6085
6086/// When accessing thread-local variables under either the general-dynamic or
6087/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
6088/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
6089/// is a function pointer to carry out the resolution.
6090///
6091/// The sequence is:
6092/// adrp x0, :tlsdesc:var
6093/// ldr x1, [x0, #:tlsdesc_lo12:var]
6094/// add x0, x0, #:tlsdesc_lo12:var
6095/// .tlsdesccall var
6096/// blr x1
6097/// (TPIDR_EL0 offset now in x0)
6098///
6099/// The above sequence must be produced unscheduled, to enable the linker to
6100/// optimize/relax this sequence.
6101/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
6102/// above sequence, and expanded really late in the compilation flow, to ensure
6103/// the sequence is produced as per above.
6104SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
6105                                                      const SDLoc &DL,
6106                                                      SelectionDAG &DAG) const {
6108
// Emit the unscheduled TLSDESC_CALLSEQ pseudo (see the doc comment above):
// the whole adrp/ldr/add/blr sequence stays as one unit so the linker can
// relax it. The pseudo-building node itself sits on a line not shown here.
6109  SDValue Chain = DAG.getEntryNode();
6111
6112  Chain =
6114  SDValue Glue = Chain.getValue(1);
6115
// Per the TLSDESC ABI the resolver leaves the TPIDR_EL0-relative offset in X0.
6116  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
6117}
6118
6119SDValue
6120AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
6121                                                SelectionDAG &DAG) const {
// Dispatch an ELF thread-local GlobalAddress to the sequence matching its
// TLS model: local-exec, initial-exec, local-dynamic or general-dynamic.
// The result is always ThreadBase (TPIDR_EL0) + a model-specific offset,
// except local-exec which returns its own complete sequence.
6122  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
6123
6125
6126  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
6127
6129    if (Model == TLSModel::LocalDynamic)
6131  }
6132
6134      Model != TLSModel::LocalExec)
6135    report_fatal_error("ELF TLS only supported in small memory model or "
6136                       "in local exec TLS model");
6137  // Different choices can be made for the maximum size of the TLS area for a
6138  // module. For the small address model, the default TLS size is 16MiB and the
6139  // maximum TLS size is 4GiB.
6140  // FIXME: add tiny and large code model support for TLS access models other
6141  // than local exec. We currently generate the same code as small for tiny,
6142  // which may be larger than needed.
6143
6144  SDValue TPOff;
6146  SDLoc DL(Op);
6147  const GlobalValue *GV = GA->getGlobal();
6148
6150
6151  if (Model == TLSModel::LocalExec) {
6152    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
6153  } else if (Model == TLSModel::InitialExec) {
6156  } else if (Model == TLSModel::LocalDynamic) {
6157    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
6158    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
6159    // the beginning of the module's TLS region, followed by a DTPREL offset
6160    // calculation.
6161
6162    // These accesses will need deduplicating if there's more than one.
6163    AArch64FunctionInfo *MFI =
6166
6167    // The call needs a relocation too for linker relaxation. It doesn't make
6168    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6169    // the address.
6170    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
6172
6173    // Now we can calculate the offset from TPIDR_EL0 to this module's
6174    // thread-local area.
6175    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6176
6177    // Now use :dtprel_whatever: operations to calculate this variable's offset
6178    // in its thread-storage area.
6182        GV, DL, MVT::i64, 0,
6184
// Two ADDs fold the hi12/lo12 DTPREL halves of the variable's offset onto
// the module base computed by the descriptor call.
6185    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
6186                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6187                    0);
6188    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
6189                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6190                    0);
6191  } else if (Model == TLSModel::GeneralDynamic) {
6192    // The call needs a relocation too for linker relaxation. It doesn't make
6193    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6194    // the address.
6197
6198    // Finally we can make a call to calculate the offset from tpidr_el0.
6199    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6200  } else
6201    llvm_unreachable("Unsupported ELF TLS access model");
6202
6203  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6204}
6205
6206SDValue
6207AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
6208                                                    SelectionDAG &DAG) const {
// Windows TLS: walk TEB (x18) -> ThreadLocalStoragePointer -> per-module TLS
// slot (indexed by _tls_index) -> variable offset via :secrel: relocations.
6209  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
6210
6211  SDValue Chain = DAG.getEntryNode();
6213  SDLoc DL(Op);
6214
// x18 is reserved as the TEB pointer on Windows/AArch64.
6215  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
6216
6217  // Load the ThreadLocalStoragePointer from the TEB
6218  // A pointer to the TLS array is located at offset 0x58 from the TEB.
6220      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
6222  Chain = TLSArray.getValue(1);
6223
6224  // Load the TLS index from the C runtime;
6225  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
6226  // This also does the same as LOADgot, but using a generic i32 load,
6227  // while LOADgot only loads i64.
6236  Chain = TLSIndex.getValue(1);
6237
6238  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
6239  // offset into the TLSArray.
// Shift-left by 3 == multiply by the 8-byte slot size.
6242                             DAG.getConstant(3, DL, PtrVT));
6243  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
6244                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
6246  Chain = TLS.getValue(1);
6247
6249  const GlobalValue *GV = GA->getGlobal();
6253      GV, DL, PtrVT, 0,
6255
6256  // Add the offset from the start of the .tls section (section base).
6257  SDValue Addr =
6258      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
6259                                 DAG.getTargetConstant(0, DL, MVT::i32)),
6260              0);
6262  return Addr;
6263}
6264
6265SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
6266                                                     SelectionDAG &DAG) const {
// Top-level TLS dispatcher: emulated TLS first (a target-independent
// __emutls_get_address scheme), otherwise route to the platform-specific
// lowering. Exactly one platform branch must match.
6268  if (DAG.getTarget().useEmulatedTLS())
6269    return LowerToTLSEmulatedModel(GA, DAG);
6270
6271  if (Subtarget->isTargetDarwin())
6272    return LowerDarwinGlobalTLSAddress(Op, DAG);
6273  if (Subtarget->isTargetELF())
6274    return LowerELFGlobalTLSAddress(Op, DAG);
6275  if (Subtarget->isTargetWindows())
6276    return LowerWindowsGlobalTLSAddress(Op, DAG);
6277
6278  llvm_unreachable("Unexpected platform trying to use TLS");
6279}
6280
6281// Looks through \param Val to determine the bit that can be used to
6282// check the sign of the value. It returns the unextended value and
6283// the sign bit position.
6284std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
6285 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
6286 return {Val.getOperand(0),
6287 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
6288 1};
6289
6290 if (Val.getOpcode() == ISD::SIGN_EXTEND)
6291 return {Val.getOperand(0),
6292 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
6293
6294 return {Val, Val.getValueSizeInBits() - 1};
6295}
6296
6297SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Lower BR_CC into AArch64 conditional branches, preferring the compact
// flag-free forms (CBZ/CBNZ/TBZ/TBNZ) when comparing against 0 / -1, and
// falling back to a compare + BRCOND (possibly two for FP conditions).
6298  SDValue Chain = Op.getOperand(0);
6299  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
6300  SDValue LHS = Op.getOperand(2);
6301  SDValue RHS = Op.getOperand(3);
6302  SDValue Dest = Op.getOperand(4);
6303  SDLoc dl(Op);
6304
6306  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
6307  // will not be produced, as they are conditional branch instructions that do
6308  // not set flags.
6309  bool ProduceNonFlagSettingCondBr =
6310      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
6311
6312  // Handle f128 first, since lowering it will result in comparing the return
6313  // value of a libcall against zero, which is just what the rest of LowerBR_CC
6314  // is expecting to deal with.
6315  if (LHS.getValueType() == MVT::f128) {
6316    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6317
6318    // If softenSetCCOperands returned a scalar, we need to compare the result
6319    // against zero to select between true and false values.
6320    if (!RHS.getNode()) {
6321      RHS = DAG.getConstant(0, dl, LHS.getValueType());
6322      CC = ISD::SETNE;
6323    }
6324  }
6325
6326  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
6327  // instruction.
6328  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
6329      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6330    // Only lower legal XALUO ops.
6331    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
6332      return SDValue();
6333
6334    // The actual operation with overflow check.
6337    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
6338
// Branching on "overflow != 1" is the inverted overflow condition.
6339    if (CC == ISD::SETNE)
6340      OFCC = getInvertedCondCode(OFCC);
6341    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
6342
6343    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6344                       Overflow);
6345  }
6346
6347  if (LHS.getValueType().isInteger()) {
6348    assert((LHS.getValueType() == RHS.getValueType()) &&
6349           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6350
6351    // If the RHS of the comparison is zero, we can potentially fold this
6352    // to a specialized branch.
6354    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
6355      if (CC == ISD::SETEQ) {
6356        // See if we can use a TBZ to fold in an AND as well.
6357        // TBZ has a smaller branch displacement than CBZ. If the offset is
6358        // out of bounds, a late MI-layer pass rewrites branches.
6359        // 403.gcc is an example that hits this case.
6360        if (LHS.getOpcode() == ISD::AND &&
6361            isa<ConstantSDNode>(LHS.getOperand(1)) &&
6362            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6363          SDValue Test = LHS.getOperand(0);
6364          uint64_t Mask = LHS.getConstantOperandVal(1);
6365          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
6366                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6367                             Dest);
6368        }
6369
6370        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
6371      } else if (CC == ISD::SETNE) {
6372        // See if we can use a TBZ to fold in an AND as well.
6373        // TBZ has a smaller branch displacement than CBZ. If the offset is
6374        // out of bounds, a late MI-layer pass rewrites branches.
6375        // 403.gcc is an example that hits this case.
6376        if (LHS.getOpcode() == ISD::AND &&
6377            isa<ConstantSDNode>(LHS.getOperand(1)) &&
6378            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6379          SDValue Test = LHS.getOperand(0);
6380          uint64_t Mask = LHS.getConstantOperandVal(1);
6381          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
6382                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6383                             Dest);
6384        }
6385
6386        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
6387      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
6388        // Don't combine AND since emitComparison converts the AND to an ANDS
6389        // (a.k.a. TST) and the test in the test bit and branch instruction
6390        // becomes redundant. This would also increase register pressure.
// "x < 0" is exactly "sign bit set", so test that one bit with TBNZ.
6391        uint64_t SignBitPos;
6392        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6393        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
6394                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6395      }
6396    }
// "x > -1" is "sign bit clear" — the TBZ dual of the SETLT case above.
6397    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
6398        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
6399      // Don't combine AND since emitComparison converts the AND to an ANDS
6400      // (a.k.a. TST) and the test in the test bit and branch instruction
6401      // becomes redundant. This would also increase register pressure.
6402      uint64_t SignBitPos;
6403      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6404      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
6405                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6406    }
6407
6408    SDValue CCVal;
6409    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6410    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6411                       Cmp);
6412  }
6413
6414  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
6415         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
6416
6417  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6418  // clean. Some of them require two branches to implement.
6419  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6423  SDValue BR1 =
6424      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
6425  if (CC2 != AArch64CC::AL) {
6427    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
6428                       Cmp);
6429  }
6430
6431  return BR1;
6432}
6433
6434SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
6435                                              SelectionDAG &DAG) const {
// Lower FCOPYSIGN(In1, In2) as a bitwise select on the sign-bit lane mask:
// the result takes In2's sign bit and In1's remaining bits. Scalars are
// first placed into vector registers via subregister inserts.
6436  EVT VT = Op.getValueType();
6437  SDLoc DL(Op);
6438
6439  SDValue In1 = Op.getOperand(0);
6440  SDValue In2 = Op.getOperand(1);
6441  EVT SrcVT = In2.getValueType();
6442
// Bring the sign source to the same FP width as the result first.
6443  if (SrcVT.bitsLT(VT))
6444    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
6445  else if (SrcVT.bitsGT(VT))
6446    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
6447
6448  EVT VecVT;
6449  uint64_t EltMask;
6451
// Move both operands into integer vector form: scalars via INSERT_SUBREG
// into an undef vector, vectors via plain bitcast.
6452  auto setVecVal = [&] (int Idx) {
6453    if (!VT.isVector()) {
6454      VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
6455                                          DAG.getUNDEF(VecVT), In1);
6456      VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
6457                                          DAG.getUNDEF(VecVT), In2);
6458    } else {
6459      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
6460      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
6461    }
6462  };
6463
// EltMask is the per-element sign-bit mask for the chosen integer lane type.
6464  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
6465    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
6466    EltMask = 0x80000000ULL;
6467    setVecVal(AArch64::ssub);
6468  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
6469    VecVT = MVT::v2i64;
6470
6471    // We want to materialize a mask with the high bit set, but the AdvSIMD
6472    // immediate moves cannot materialize that in a single instruction for
6473    // 64-bit elements. Instead, materialize zero and then negate it.
6474    EltMask = 0;
6475
6476    setVecVal(AArch64::dsub);
6477  } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
6478    VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
6479    EltMask = 0x8000ULL;
6480    setVecVal(AArch64::hsub);
6481  } else {
6482    llvm_unreachable("Invalid type for copysign!");
6483  }
6484
6485  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
6486
6487  // If we couldn't materialize the mask above, then the mask vector will be
6488  // the zero vector, and we need to negate it here.
6489  if (VT == MVT::f64 || VT == MVT::v2f64) {
6493  }
6494
// The bitwise-select node combining mask, sign source and magnitude source
// is built on a line not visible in this view.
6495  SDValue Sel =
6497
// For scalar results, pull the value back out of the vector register.
6498  if (VT == MVT::f16)
6499    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
6500  if (VT == MVT::f32)
6501    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
6502  else if (VT == MVT::f64)
6503    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
6504  else
6505    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
6506}
6507
6508SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
// Custom-lower CTPOP through the NEON CNT instruction; bail out (SDValue())
// to get the generic expansion when FP/NEON use is not allowed.
6510                                   Attribute::NoImplicitFloat))
6511    return SDValue();
6512
6513  if (!Subtarget->hasNEON())
6514    return SDValue();
6515
6516  // While there is no integer popcount instruction, it can
6517  // be more efficiently lowered to the following sequence that uses
6518  // AdvSIMD registers/instructions as long as the copies to/from
6519  // the AdvSIMD registers are cheap.
6520  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
6521  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
6522  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
6523  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
6524  SDValue Val = Op.getOperand(0);
6525  SDLoc DL(Op);
6526  EVT VT = Op.getValueType();
6527
6528  if (VT == MVT::i32 || VT == MVT::i64) {
6529    if (VT == MVT::i32)
6530      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
6531    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
6532
// Per-byte CNT, then uaddlv sums the eight byte counts into one scalar.
6534    SDValue UaddLV = DAG.getNode(
6536        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
6537
6538    if (VT == MVT::i64)
6540    return UaddLV;
6541  } else if (VT == MVT::i128) {
6542    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
6543
6545    SDValue UaddLV = DAG.getNode(
6547        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
6548
6550  }
6551
6552  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
6553    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
6554
6555  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6556          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6557         "Unexpected type for custom ctpop lowering");
6558
// Vector case: popcount per byte, then widen back up to the element size.
6560  Val = DAG.getBitcast(VT8Bit, Val);
6561  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
6562
6563  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6564  unsigned EltSize = 8;
6565  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6566  while (EltSize != VT.getScalarSizeInBits()) {
6567    EltSize *= 2;
6568    NumElts /= 2;
// Each uaddlp halves the lane count while doubling the lane width.
6570    Val = DAG.getNode(
6572        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
6573  }
6574
6575  return Val;
6576}
6577
6578SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
6579 EVT VT = Op.getValueType();
6580 assert(VT.isScalableVector() ||
6581 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
6582
6583 SDLoc DL(Op);
6584 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
6585 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
6586}
6587
6588SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Lower (STRICT_F)SETCC(S) to a compare followed by CSEL/CSINC producing
// 0 or 1 (ZeroOrOneBooleanContents). Strict FP variants thread a chain.
6589
6590  if (Op.getValueType().isVector())
6591    return LowerVSETCC(Op, DAG);
6592
6593  bool IsStrict = Op->isStrictFPOpcode();
6594  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
// Strict nodes carry the chain as operand 0, shifting the LHS/RHS/CC slots.
6595  unsigned OpNo = IsStrict ? 1 : 0;
6596  SDValue Chain;
6597  if (IsStrict)
6598    Chain = Op.getOperand(0);
6599  SDValue LHS = Op.getOperand(OpNo + 0);
6600  SDValue RHS = Op.getOperand(OpNo + 1);
6601  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
6602  SDLoc dl(Op);
6603
6604  // We chose ZeroOrOneBooleanContents, so use zero and one.
6605  EVT VT = Op.getValueType();
6606  SDValue TVal = DAG.getConstant(1, dl, VT);
6607  SDValue FVal = DAG.getConstant(0, dl, VT);
6608
6609  // Handle f128 first, since one possible outcome is a normal integer
6610  // comparison which gets picked up by the next if statement.
6611  if (LHS.getValueType() == MVT::f128) {
6612    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
6613                        IsSignaling);
6614
6615    // If softenSetCCOperands returned a scalar, use it.
6616    if (!RHS.getNode()) {
6617      assert(LHS.getValueType() == Op.getValueType() &&
6618             "Unexpected setcc expansion!");
6619      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
6620    }
6621  }
6622
6623  if (LHS.getValueType().isInteger()) {
6624    SDValue CCVal;
6626        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
6627
6628    // Note that we inverted the condition above, so we reverse the order of
6629    // the true and false operands here.  This will allow the setcc to be
6630    // matched to a single CSINC instruction.
6631    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
6632    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
6633  }
6634
6635  // Now we know we're dealing with FP values.
6636  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
6637         LHS.getValueType() == MVT::f64);
6638
6639  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
6640  // and do the comparison.
6641  SDValue Cmp;
6642  if (IsStrict)
6643    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
6644  else
6645    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6646
6649  SDValue Res;
6650  if (CC2 == AArch64CC::AL) {
6651    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
6652                          CC2);
6654
6655    // Note that we inverted the condition above, so we reverse the order of
6656    // the true and false operands here.  This will allow the setcc to be
6657    // matched to a single CSINC instruction.
6658    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
6659  } else {
6660    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
6661    // totally clean.  Some of them require two CSELs to implement.  As is in
6662    // this case, we emit the first CSEL and then emit a second using the output
6663    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
6664
6665    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
6667    SDValue CS1 =
6668        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
6669
6671    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
6672  }
// For strict nodes, return {value, chain}; the chain is the compare's chain.
6673  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
6674}
6675
6676SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
6677                                              SDValue RHS, SDValue TVal,
6678                                              SDValue FVal, const SDLoc &dl,
6679                                              SelectionDAG &DAG) const {
// Worker for SELECT_CC lowering: emit compare(LHS, RHS) + a conditional
// select, upgrading CSEL to CSINV/CSNEG/CSINC when the two select arms are
// related constants, and reusing LHS in a register to avoid materializing
// a constant arm equal to the compared value.
6680  // Handle f128 first, because it will result in a comparison of some RTLIB
6681  // call result against zero.
6682  if (LHS.getValueType() == MVT::f128) {
6683    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6684
6685    // If softenSetCCOperands returned a scalar, we need to compare the result
6686    // against zero to select between true and false values.
6687    if (!RHS.getNode()) {
6688      RHS = DAG.getConstant(0, dl, LHS.getValueType());
6689      CC = ISD::SETNE;
6690    }
6691  }
6692
6693  // Also handle f16, for which we need to do a f32 comparison.
6694  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
6695    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
6696    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
6697  }
6698
6699  // Next, handle integers.
6700  if (LHS.getValueType().isInteger()) {
6701    assert((LHS.getValueType() == RHS.getValueType()) &&
6702           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6703
6704    unsigned Opcode = AArch64ISD::CSEL;
6705
6706    // If both the TVal and the FVal are constants, see if we can swap them in
6707    // order to form a CSINV or CSINC out of them.
6710
// (-1, 0) arms: "csinv x, xzr, xzr, !cc" — swap and invert the condition.
6711    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
6714      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
// (1, 0) arms: "csinc x, xzr, xzr, !cc".
6715    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
6718      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6719    } else if (TVal.getOpcode() == ISD::XOR) {
6720      // If TVal is a NOT we want to swap TVal and FVal so that we can match
6721      // with a CSINV rather than a CSEL.
6722      if (isAllOnesConstant(TVal.getOperand(1))) {
6725        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6726      }
6727    } else if (TVal.getOpcode() == ISD::SUB) {
6728      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
6729      // that we can match with a CSNEG rather than a CSEL.
6730      if (isNullConstant(TVal.getOperand(0))) {
6733        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6734      }
6735    } else if (CTVal && CFVal) {
6736      const int64_t TrueVal = CTVal->getSExtValue();
6737      const int64_t FalseVal = CFVal->getSExtValue();
6738      bool Swap = false;
6739
6740      // If both TVal and FVal are constants, see if FVal is the
6741      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
6742      // instead of a CSEL in that case.
6743      if (TrueVal == ~FalseVal) {
6744        Opcode = AArch64ISD::CSINV;
// Guard against INT64_MIN whose negation overflows.
6745      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
6746                 TrueVal == -FalseVal) {
6747        Opcode = AArch64ISD::CSNEG;
6748      } else if (TVal.getValueType() == MVT::i32) {
6749        // If our operands are only 32-bit wide, make sure we use 32-bit
6750        // arithmetic for the check whether we can use CSINC. This ensures that
6751        // the addition in the check will wrap around properly in case there is
6752        // an overflow (which would not be the case if we do the check with
6753        // 64-bit arithmetic).
6754        const uint32_t TrueVal32 = CTVal->getZExtValue();
6755        const uint32_t FalseVal32 = CFVal->getZExtValue();
6756
6757        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
6758          Opcode = AArch64ISD::CSINC;
6759
// CSINC yields FVal+1 on the inverted condition, so put the larger value
// in the FVal slot by swapping.
6760          if (TrueVal32 > FalseVal32) {
6761            Swap = true;
6762          }
6763        }
6764        // 64-bit check whether we can use CSINC.
6765      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
6766        Opcode = AArch64ISD::CSINC;
6767
6768        if (TrueVal > FalseVal) {
6769          Swap = true;
6770        }
6771      }
6772
6773      // Swap TVal and FVal if necessary.
6774      if (Swap) {
6777        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6778      }
6779
6780      if (Opcode != AArch64ISD::CSEL) {
6781        // Drop FVal since we can get its value by simply inverting/negating
6782        // TVal.
6783        FVal = TVal;
6784      }
6785    }
6786
6787    // Avoid materializing a constant when possible by reusing a known value in
6788    // a register.  However, don't perform this optimization if the known value
6789    // is one, zero or negative one in the case of a CSEL.  We can always
6790    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
6791    // FVal, respectively.
6793    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
6794        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
6796      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
6797      // "a != C ? x : a" to avoid materializing C.
6798      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
6799        TVal = LHS;
6800      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
6801        FVal = LHS;
6802    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
6803      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
6804      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
6805      // avoid materializing C.
6807      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
6808        Opcode = AArch64ISD::CSINV;
6809        TVal = LHS;
6810        FVal = DAG.getConstant(0, dl, FVal.getValueType());
6811      }
6812    }
6813
6814    SDValue CCVal;
6815    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6816    EVT VT = TVal.getValueType();
6817    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
6818  }
6819
6820  // Now we know we're dealing with FP values.
6821  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
6822         LHS.getValueType() == MVT::f64);
6823  assert(LHS.getValueType() == RHS.getValueType());
6824  EVT VT = TVal.getValueType();
6825  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6826
6827  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6828  // clean.  Some of them require two CSELs to implement.
6831
6832  if (DAG.getTarget().Options.UnsafeFPMath) {
6833    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
6834    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
// Valid only under UnsafeFPMath: with -0.0/NaN semantics, a comparing equal
// to +0.0 is not necessarily bitwise equal to it.
6836    if (RHSVal && RHSVal->isZero()) {
6839
6840      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
6841          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
6842        TVal = LHS;
6843      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
6844               CFVal && CFVal->isZero() &&
6845               FVal.getValueType() == LHS.getValueType())
6846        FVal = LHS;
6847    }
6848  }
6849
6850  // Emit first, and possibly only, CSEL.
6852  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
6853
6854  // If we need a second CSEL, emit it, using the output of the first as the
6855  // RHS.  We're effectively OR'ing the two CC's together.
6856  if (CC2 != AArch64CC::AL) {
6858    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
6859  }
6860
6861  // Otherwise, return the output of the first CSEL.
6862  return CS1;
6863}
6864
6865SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
6866 SelectionDAG &DAG) const {
6867 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
6868 SDValue LHS = Op.getOperand(0);
6869 SDValue RHS = Op.getOperand(1);
6870 SDValue TVal = Op.getOperand(2);
6871 SDValue FVal = Op.getOperand(3);
6872 SDLoc DL(Op);
6873 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
6874}
6875
6876SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
6877                                           SelectionDAG &DAG) const {
// Lower SELECT(cond, t, f): scalable vectors become a VSELECT on a splat
// predicate, overflow-intrinsic conditions use the flags directly, and
// everything else is funneled into the SELECT_CC worker.
6878  SDValue CCVal = Op->getOperand(0);
6879  SDValue TVal = Op->getOperand(1);
6880  SDValue FVal = Op->getOperand(2);
6881  SDLoc DL(Op);
6882
6883  EVT Ty = Op.getValueType();
6884  if (Ty.isScalableVector()) {
// Splat the scalar condition across an i1 predicate vector of matching
// element count (the splat node itself is on a line not shown here).
6886    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
6888    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
6889  }
6890
6891  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
6892  // instruction.
6893  if (ISD::isOverflowIntrOpRes(CCVal)) {
6894    // Only lower legal XALUO ops.
6895    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
6896      return SDValue();
6897
// Re-emit the overflow op and select directly on its overflow flag,
// skipping a separate compare of the extracted boolean.
6900    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
6901    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
6902
6903    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
6904                       CCVal, Overflow);
6905  }
6906
6907  // Lower it the same way as we would lower a SELECT_CC node.
6908  ISD::CondCode CC;
6909  SDValue LHS, RHS;
6910  if (CCVal.getOpcode() == ISD::SETCC) {
// Fold the SETCC into the select instead of materializing a boolean.
6911    LHS = CCVal.getOperand(0);
6912    RHS = CCVal.getOperand(1);
6913    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
6914  } else {
// Generic boolean condition: treat it as "cond != 0".
6915    LHS = CCVal;
6916    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
6917    CC = ISD::SETNE;
6918  }
6919  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
6920}
6921
6922SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
6923 SelectionDAG &DAG) const {
6924 // Jump table entries as PC relative offsets. No additional tweaking
6925 // is necessary here. Just get the address of the jump table.
6927
6929 !Subtarget->isTargetMachO()) {
6930 return getAddrLarge(JT, DAG);
6931 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6932 return getAddrTiny(JT, DAG);
6933 }
6934 return getAddr(JT, DAG);
6935}
6936
6937SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
6938 SelectionDAG &DAG) const {
6939 // Jump table entries as PC relative offsets. No additional tweaking
6940 // is necessary here. Just get the address of the jump table.
6941 SDLoc DL(Op);
6942 SDValue JT = Op.getOperand(1);
6943 SDValue Entry = Op.getOperand(2);
6944 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
6945
6946 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6947 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
6948
6949 SDNode *Dest =
6950 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
6951 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
6952 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
6953 SDValue(Dest, 0));
6954}
6955
6956SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
6957 SelectionDAG &DAG) const {
6959
6961 // Use the GOT for the large code model on iOS.
6962 if (Subtarget->isTargetMachO()) {
6963 return getGOT(CP, DAG);
6964 }
6965 return getAddrLarge(CP, DAG);
6966 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6967 return getAddrTiny(CP, DAG);
6968 } else {
6969 return getAddr(CP, DAG);
6970 }
6971}
6972
6973SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
6974 SelectionDAG &DAG) const {
6977 !Subtarget->isTargetMachO()) {
6978 return getAddrLarge(BA, DAG);
6979 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6980 return getAddrTiny(BA, DAG);
6981 }
6982 return getAddr(BA, DAG);
6983}
6984
6985SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
6986 SelectionDAG &DAG) const {
6987 AArch64FunctionInfo *FuncInfo =
6989
6990 SDLoc DL(Op);
6991 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
6994 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6995 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
6996 MachinePointerInfo(SV));
6997}
6998
6999SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
7000 SelectionDAG &DAG) const {
7001 AArch64FunctionInfo *FuncInfo =
7003
7004 SDLoc DL(Op);
7005 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
7006 ? FuncInfo->getVarArgsGPRIndex()
7007 : FuncInfo->getVarArgsStackIndex(),
7008 getPointerTy(DAG.getDataLayout()));
7009 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7010 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7011 MachinePointerInfo(SV));
7012}
7013
7014SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
7015 SelectionDAG &DAG) const {
7016 // The layout of the va_list struct is specified in the AArch64 Procedure Call
7017 // Standard, section B.3.
7020 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7022 auto PtrVT = getPointerTy(DAG.getDataLayout());
7023 SDLoc DL(Op);
7024
7025 SDValue Chain = Op.getOperand(0);
7026 SDValue VAList = Op.getOperand(1);
7027 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7029
7030 // void *__stack at offset 0
7031 unsigned Offset = 0;
7033 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
7034 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
7036
7037 // void *__gr_top at offset 8 (4 on ILP32)
7038 Offset += PtrSize;
7039 int GPRSize = FuncInfo->getVarArgsGPRSize();
7040 if (GPRSize > 0) {
7042
7044 DAG.getConstant(Offset, DL, PtrVT));
7045
7046 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
7048 DAG.getConstant(GPRSize, DL, PtrVT));
7050
7051 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
7053 Align(PtrSize)));
7054 }
7055
7056 // void *__vr_top at offset 16 (8 on ILP32)
7057 Offset += PtrSize;
7058 int FPRSize = FuncInfo->getVarArgsFPRSize();
7059 if (FPRSize > 0) {
7062 DAG.getConstant(Offset, DL, PtrVT));
7063
7064 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
7066 DAG.getConstant(FPRSize, DL, PtrVT));
7068
7069 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
7071 Align(PtrSize)));
7072 }
7073
7074 // int __gr_offs at offset 24 (12 on ILP32)
7075 Offset += PtrSize;
7077 DAG.getConstant(Offset, DL, PtrVT));
7078 MemOps.push_back(
7079 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
7081
7082 // int __vr_offs at offset 28 (16 on ILP32)
7083 Offset += 4;
7085 DAG.getConstant(Offset, DL, PtrVT));
7086 MemOps.push_back(
7087 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
7089
7091}
7092
7093SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
7094 SelectionDAG &DAG) const {
7096
7097 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
7098 return LowerWin64_VASTART(Op, DAG);
7099 else if (Subtarget->isTargetDarwin())
7100 return LowerDarwin_VASTART(Op, DAG);
7101 else
7102 return LowerAAPCS_VASTART(Op, DAG);
7103}
7104
7105SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
7106 SelectionDAG &DAG) const {
7107 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
7108 // pointer.
7109 SDLoc DL(Op);
7110 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7111 unsigned VaListSize =
7112 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7113 ? PtrSize
7114 : Subtarget->isTargetILP32() ? 20 : 32;
7115 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7116 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7117
7118 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
7120 Align(PtrSize), false, false, false,
7122}
7123
7124SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7125 assert(Subtarget->isTargetDarwin() &&
7126 "automatic va_arg instruction only works on Darwin");
7127
7128 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7129 EVT VT = Op.getValueType();
7130 SDLoc DL(Op);
7131 SDValue Chain = Op.getOperand(0);
7132 SDValue Addr = Op.getOperand(1);
7133 MaybeAlign Align(Op.getConstantOperandVal(3));
7134 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
7135 auto PtrVT = getPointerTy(DAG.getDataLayout());
7137 SDValue VAList =
7138 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
7139 Chain = VAList.getValue(1);
7141
7142 if (VT.isScalableVector())
7143 report_fatal_error("Passing SVE types to variadic functions is "
7144 "currently not supported");
7145
7146 if (Align && *Align > MinSlotSize) {
7148 DAG.getConstant(Align->value() - 1, DL, PtrVT));
7150 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
7151 }
7152
7153 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
7154 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
7155
7156 // Scalar integer and FP values smaller than 64 bits are implicitly extended
7157 // up to 64 bits. At the very least, we have to increase the striding of the
7158 // vaargs list to match this, and for FP values we need to introduce
7159 // FP_ROUND nodes as well.
7160 if (VT.isInteger() && !VT.isVector())
7161 ArgSize = std::max(ArgSize, MinSlotSize);
7162 bool NeedFPTrunc = false;
7163 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
7164 ArgSize = 8;
7165 NeedFPTrunc = true;
7166 }
7167
7168 // Increment the pointer, VAList, to the next vaarg
7170 DAG.getConstant(ArgSize, DL, PtrVT));
7172
7173 // Store the incremented VAList to the legalized pointer
7175 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
7176
7177 // Load the actual argument out of the pointer VAList
7178 if (NeedFPTrunc) {
7179 // Load the value as an f64.
7180 SDValue WideFP =
7182 // Round the value down to an f32.
7183 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
7184 DAG.getIntPtrConstant(1, DL));
7185 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
7186 // Merge the rounded value with the chain output of the load.
7187 return DAG.getMergeValues(Ops, DL);
7188 }
7189
7190 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
7191}
7192
7193SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
7194 SelectionDAG &DAG) const {
7196 MFI.setFrameAddressIsTaken(true);
7197
7198 EVT VT = Op.getValueType();
7199 SDLoc DL(Op);
7200 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7202 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
7203 while (Depth--)
7204 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
7206
7207 if (Subtarget->isTargetILP32())
7209 DAG.getValueType(VT));
7210
7211 return FrameAddr;
7212}
7213
7214SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
7215 SelectionDAG &DAG) const {
7217
7218 EVT VT = getPointerTy(DAG.getDataLayout());
7219 SDLoc DL(Op);
7220 int FI = MFI.CreateFixedObject(4, 0, false);
7221 return DAG.getFrameIndex(FI, VT);
7222}
7223
7224#define GET_REGISTER_MATCHER
7225#include "AArch64GenAsmMatcher.inc"
7226
7227// FIXME? Maybe this could be a TableGen attribute on some registers and
7228// this table could be generated automatically from RegInfo.
7229Register AArch64TargetLowering::
7230getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
7232 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
7233 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
7234 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
7235 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
7236 Reg = 0;
7237 }
7238 if (Reg)
7239 return Reg;
7240 report_fatal_error(Twine("Invalid register name \""
7241 + StringRef(RegName) + "\"."));
7242}
7243
7244SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
7245 SelectionDAG &DAG) const {
7247
7248 EVT VT = Op.getValueType();
7249 SDLoc DL(Op);
7250
7252 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
7254
7255 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
7256}
7257
7258SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
7259 SelectionDAG &DAG) const {
7261 MachineFrameInfo &MFI = MF.getFrameInfo();
7262 MFI.setReturnAddressIsTaken(true);
7263
7264 EVT VT = Op.getValueType();
7265 SDLoc DL(Op);
7266 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7268 if (Depth) {
7269 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7271 ReturnAddress = DAG.getLoad(
7272 VT, DL, DAG.getEntryNode(),
7274 } else {
7275 // Return LR, which contains the return address. Mark it an implicit
7276 // live-in.
7277 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
7278 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7279 }
7280
7281 // The XPACLRI instruction assembles to a hint-space instruction before
7282 // Armv8.3-A therefore this instruction can be safely used for any pre
7283 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
7284 // that instead.
7285 SDNode *St;
7286 if (Subtarget->hasPAuth()) {
7287 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
7288 } else {
7289 // XPACLRI operates on LR therefore we must move the operand accordingly.
7290 SDValue Chain =
7291 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
7292 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
7293 }
7294 return SDValue(St, 0);
7295}
7296
7297/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
7298/// i64 values and take a 2 x i64 value to shift plus a shift amount.
7299SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
7300 SelectionDAG &DAG) const {
7301 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7302 EVT VT = Op.getValueType();
7303 unsigned VTBits = VT.getSizeInBits();
7304 SDLoc dl(Op);
7305 SDValue ShOpLo = Op.getOperand(0);
7306 SDValue ShOpHi = Op.getOperand(1);
7307 SDValue ShAmt = Op.getOperand(2);
7308 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
7309
7310 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
7311
7313 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
7315
7316 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
7317 // is "undef". We wanted 0, so CSEL it directly.
7318 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
7319 ISD::SETEQ, dl, DAG);
7320 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
7321 HiBitsForLo =
7322 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
7323 HiBitsForLo, CCVal, Cmp);
7324
7325 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
7326 DAG.getConstant(VTBits, dl, MVT::i64));
7327
7328 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
7330 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
7331
7333 dl, DAG);
7334 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
7335 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
7337 LoForNormalShift, CCVal, Cmp);
7338
7339 // AArch64 shifts larger than the register width are wrapped rather than
7340 // clamped, so we can't just emit "hi >> x".
7341 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
7343 Opc == ISD::SRA
7344 ? DAG.getNode(Opc, dl, VT, ShOpHi,
7345 DAG.getConstant(VTBits - 1, dl, MVT::i64))
7346 : DAG.getConstant(0, dl, VT);
7348 HiForNormalShift, CCVal, Cmp);
7349
7350 SDValue Ops[2] = { Lo, Hi };
7351 return DAG.getMergeValues(Ops, dl);
7352}
7353
7354/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
7355/// i64 values and take a 2 x i64 value to shift plus a shift amount.
7356SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
7357 SelectionDAG &DAG) const {
7358 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7359 EVT VT = Op.getValueType();
7360 unsigned VTBits = VT.getSizeInBits();
7361 SDLoc dl(Op);
7362 SDValue ShOpLo = Op.getOperand(0);
7363 SDValue ShOpHi = Op.getOperand(1);
7364 SDValue ShAmt = Op.getOperand(2);
7365
7366 assert(Op.getOpcode() == ISD::SHL_PARTS);
7368 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
7370
7371 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
7372 // is "undef". We wanted 0, so CSEL it directly.
7373 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
7374 ISD::SETEQ, dl, DAG);
7375 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
7376 LoBitsForHi =
7377 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
7378 LoBitsForHi, CCVal, Cmp);
7379
7380 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
7381 DAG.getConstant(VTBits, dl, MVT::i64));
7382 SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
7384 DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
7385
7387
7389 dl, DAG);
7390 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
7392 HiForNormalShift, CCVal, Cmp);
7393
7394 // AArch64 shifts of larger than register sizes are wrapped rather than
7395 // clamped, so we can't just emit "lo << a" if a is too big.
7396 SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
7397 SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7399 LoForNormalShift, CCVal, Cmp);
7400
7401 SDValue Ops[2] = { Lo, Hi };
7402 return DAG.getMergeValues(Ops, dl);
7403}
7404
7406 const GlobalAddressSDNode *GA) const {
7407 // Offsets are folded in the DAG combine rather than here so that we can
7408 // intelligently choose an offset based on the uses.
7409 return false;
7410}
7411
7413 bool OptForSize) const {
7414 bool IsLegal = false;
7415 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
7416 // 16-bit case when target has full fp16 support.
7417 // FIXME: We should be able to handle f128 as well with a clever lowering.
7418 const APInt ImmInt = Imm.bitcastToAPInt();
7419 if (VT == MVT::f64)
7421 else if (VT == MVT::f32)
7423 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
7425 // TODO: fmov h0, w0 is also legal, however on't have an isel pattern to
7426 // generate that fmov.
7427
7428 // If we can not materialize in immediate field for fmov, check if the
7429 // value can be encoded as the immediate operand of a logical instruction.
7430 // The immediate value will be created with either MOVZ, MOVN, or ORR.
7431 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
7432 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
7433 // however the mov+fmov sequence is always better because of the reduced
7434 // cache pressure. The timings are still the same if you consider
7435 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
7436 // movw+movk is fused). So we limit up to 2 instrdduction at most.
7438 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
7439 Insn);
7440 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
7441 IsLegal = Insn.size() <= Limit;
7442 }
7443
7444 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
7445 << " imm value: "; Imm.dump(););
7446 return IsLegal;
7447}
7448
7449//===----------------------------------------------------------------------===//
7450// AArch64 Optimization Hooks
7451//===----------------------------------------------------------------------===//
7452
7453static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
7454 SDValue Operand, SelectionDAG &DAG,
7455 int &ExtraSteps) {
7456 EVT VT = Operand.getValueType();
7457 if (ST->hasNEON() &&
7458 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
7459 VT == MVT::f32 || VT == MVT::v1f32 ||
7460 VT == MVT::v2f32 || VT == MVT::v4f32)) {
7462 // For the reciprocal estimates, convergence is quadratic, so the number
7463 // of digits is doubled after each iteration. In ARMv8, the accuracy of
7464 // the initial estimate is 2^-8. Thus the number of extra steps to refine
7465 // the result for float (23 mantissa bits) is 2 and for double (52
7466 // mantissa bits) is 3.
7467 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
7468
7469 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
7470 }
7471
7472 return SDValue();
7473}
7474
7475SDValue
7476AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
7477 const DenormalMode &Mode) const {
7478 SDLoc DL(Op);
7479 EVT VT = Op.getValueType();
7480 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
7481 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
7482 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
7483}
7484
SDValue
AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                                   SelectionDAG &DAG) const {
  // When the input tested by getSqrtInputTest matches, the estimate sequence
  // is bypassed and the input value itself is used as the result.
  return Op;
}
7490
7491SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
7492 SelectionDAG &DAG, int Enabled,
7493 int &ExtraSteps,
7494 bool &UseOneConst,
7495 bool Reciprocal) const {
7498 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
7499 DAG, ExtraSteps)) {
7500 SDLoc DL(Operand);
7501 EVT VT = Operand.getValueType();
7502
7503 SDNodeFlags Flags;
7504 Flags.setAllowReassociation(true);
7505
7506 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
7507 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
7508 for (int i = ExtraSteps; i > 0; --i) {
7509 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
7510 Flags);
7511 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
7512 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
7513 }
7514 if (!Reciprocal)
7515 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
7516
7517 ExtraSteps = 0;
7518 return Estimate;
7519 }
7520
7521 return SDValue();
7522}
7523
7524SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
7525 SelectionDAG &DAG, int Enabled,
7526 int &ExtraSteps) const {
7528 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
7529 DAG, ExtraSteps)) {
7530 SDLoc DL(Operand);
7531 EVT VT = Operand.getValueType();
7532
7533 SDNodeFlags Flags;
7534 Flags.setAllowReassociation(true);
7535
7536 // Newton reciprocal iteration: E * (2 - X * E)
7537 // AArch64 reciprocal iteration instruction: (2 - M * N)
7538 for (int i = ExtraSteps; i > 0; --i) {
7539 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
7540 Estimate, Flags);
7541 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
7542 }
7543
7544 ExtraSteps = 0;
7545 return Estimate;
7546 }
7547
7548 return SDValue();
7549}
7550
7551//===----------------------------------------------------------------------===//
7552// AArch64 Inline Assembly Support
7553//===----------------------------------------------------------------------===//
7554
7555// Table of Constraints
7556// TODO: This is the current set of constraints supported by ARM for the
7557// compiler, not all of them may make sense.
7558//
7559// r - A general register
7560// w - An FP/SIMD register of some size in the range v0-v31
7561// x - An FP/SIMD register of some size in the range v0-v15
7562// I - Constant that can be used with an ADD instruction
7563// J - Constant that can be used with a SUB instruction
7564// K - Constant that can be used with a 32-bit logical instruction
7565// L - Constant that can be used with a 64-bit logical instruction
7566// M - Constant that can be used as a 32-bit MOV immediate
7567// N - Constant that can be used as a 64-bit MOV immediate
7568// Q - A memory reference with base register and no offset
7569// S - A symbolic address
7570// Y - Floating point constant zero
7571// Z - Integer constant zero
7572//
7573// Note that general register operands will be output using their 64-bit x
7574// register name, whatever the size of the variable, unless the asm operand
7575// is prefixed by the %w modifier. Floating-point and SIMD register operands
7576// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
7577// %q modifier.
7578const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
7579 // At this point, we have to lower this constraint to something else, so we
7580 // lower it to an "r" or "w". However, by doing this we will force the result
7581 // to be in register, while the X constraint is much more permissive.
7582 //
7583 // Although we are correct (we are free to emit anything, without
7584 // constraints), we might break use cases that would expect us to be more
7585 // efficient and emit something else.
7586 if (!Subtarget->hasFPARMv8())
7587 return "r";
7588
7589 if (ConstraintVT.isFloatingPoint())
7590 return "w";
7591
7592 if (ConstraintVT.isVector() &&
7593 (ConstraintVT.getSizeInBits() == 64 ||
7594 ConstraintVT.getSizeInBits() == 128))
7595 return "w";
7596
7597 return "r";
7598}
7599
7605
7608 if (Constraint == "Upa")
7610 if (Constraint == "Upl")
7612 return P;
7613}
7614
7615/// getConstraintType - Given a constraint letter, return the type of
7616/// constraint it is for this target.
7618AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
7619 if (Constraint.size() == 1) {
7620 switch (Constraint[0]) {
7621 default:
7622 break;
7623 case 'x':
7624 case 'w':
7625 case 'y':
7626 return C_RegisterClass;
7627 // An address with a single base register. Due to the way we
7628 // currently handle addresses it is the same as 'r'.
7629 case 'Q':
7630 return C_Memory;
7631 case 'I':
7632 case 'J':
7633 case 'K':
7634 case 'L':
7635 case 'M':
7636 case 'N':
7637 case 'Y':
7638 case 'Z':
7639 return C_Immediate;
7640 case 'z':
7641 case 'S': // A symbolic address
7642 return C_Other;
7643 }
7644 } else if (parsePredicateConstraint(Constraint) !=
7646 return C_RegisterClass;
7647 return TargetLowering::getConstraintType(Constraint);
7648}
7649
7650/// Examine constraint type and operand type and determine a weight value.
7651/// This object must already have been set up with the operand type
7652/// and the current alternative constraint selected.
7654AArch64TargetLowering::getSingleConstraintMatchWeight(
7655 AsmOperandInfo &info, const char *constraint) const {
7657 Value *CallOperandVal = info.CallOperandVal;
7658 // If we don't have a value, we can't do a match,
7659 // but allow it at the lowest weight.
7660 if (!CallOperandVal)
7661 return CW_Default;
7662 Type *type = CallOperandVal->getType();
7663 // Look at the constraint type.
7664 switch (*constraint) {
7665 default:
7667 break;
7668 case 'x':
7669 case 'w':
7670 case 'y':
7671 if (type->isFloatingPointTy() || type->isVectorTy())
7672 weight = CW_Register;
7673 break;
7674 case 'z':
7675 weight = CW_Constant;
7676 break;
7677 case 'U':
7679 weight = CW_Register;
7680 break;
7681 }
7682 return weight;
7683}
7684
7685std::pair<unsigned, const TargetRegisterClass *>
7686AArch64TargetLowering::getRegForInlineAsmConstraint(
7687 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
7688 if (Constraint.size() == 1) {
7689 switch (Constraint[0]) {
7690 case 'r':
7691 if (VT.isScalableVector())
7692 return std::make_pair(0U, nullptr);
7693 if (VT.getFixedSizeInBits() == 64)
7694 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
7695 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
7696 case 'w': {
7697 if (!Subtarget->hasFPARMv8())
7698 break;
7699 if (VT.isScalableVector()) {
7700 if (VT.getVectorElementType() != MVT::i1)
7701 return std::make_pair(0U, &AArch64::ZPRRegClass);
7702 return std::make_pair(0U, nullptr);
7703 }
7704 uint64_t VTSize = VT.getFixedSizeInBits();
7705 if (VTSize == 16)
7706 return std::make_pair(0U, &AArch64::FPR16RegClass);
7707 if (VTSize == 32)
7708 return std::make_pair(0U, &AArch64::FPR32RegClass);
7709 if (VTSize == 64)
7710 return std::make_pair(0U, &AArch64::FPR64RegClass);
7711 if (VTSize == 128)
7712 return std::make_pair(0U, &AArch64::FPR128RegClass);
7713 break;
7714 }
7715 // The instructions that this constraint is designed for can
7716 // only take 128-bit registers so just use that regclass.
7717 case 'x':
7718 if (!Subtarget->hasFPARMv8())
7719 break;
7720 if (VT.isScalableVector())
7721 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
7722 if (VT.getSizeInBits() == 128)
7723 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
7724 break;
7725 case 'y':
7726 if (!Subtarget->hasFPARMv8())
7727 break;
7728 if (VT.isScalableVector())
7729 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
7730 break;
7731 }
7732 } else {
7734 if (PC != PredicateConstraint::Invalid) {
7735 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
7736 return std::make_pair(0U, nullptr);
7737 bool restricted = (PC == PredicateConstraint::Upl);
7738 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
7739 : std::make_pair(0U, &AArch64::PPRRegClass);
7740 }
7741 }
7742 if (StringRef("{cc}").equals_lower(Constraint))
7743 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
7744
7745 // Use the default implementation in TargetLowering to convert the register
7746 // constraint into a member of a register class.
7747 std::pair<unsigned, const TargetRegisterClass *> Res;
7749
7750 // Not found as a standard register?
7751 if (!Res.second) {
7752 unsigned Size = Constraint.size();
7753 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
7754 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
7755 int RegNo;
7756 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
7757 if (!Failed && RegNo >= 0 && RegNo <= 31) {
7758 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
7759 // By default we'll emit v0-v31 for this unless there's a modifier where
7760 // we'll emit the correct register as well.
7761 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
7762 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
7763 Res.second = &AArch64::FPR64RegClass;
7764 } else {
7765 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
7766 Res.second = &AArch64::FPR128RegClass;
7767 }
7768 }
7769 }
7770 }
7771
7772 if (Res.second && !Subtarget->hasFPARMv8() &&
7773 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
7774 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
7775 return std::make_pair(0U, nullptr);
7776
7777 return Res;
7778}
7779
7780/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
7781/// vector. If it is invalid, don't add anything to Ops.
7782void AArch64TargetLowering::LowerAsmOperandForConstraint(
7783 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
7784 SelectionDAG &DAG) const {
7786
7787 // Currently only support length 1 constraints.
7788 if (Constraint.length() != 1)
7789 return;
7790
7791 char ConstraintLetter = Constraint[0];
7792 switch (ConstraintLetter) {
7793 default:
7794 break;
7795
7796 // This set of constraints deal with valid constants for various instructions.
7797 // Validate and return a target constant for them if we can.
7798 case 'z': {
7799 // 'z' maps to xzr or wzr so it needs an input of 0.
7800 if (!isNullConstant(Op))
7801 return;
7802
7803 if (Op.getValueType() == MVT::i64)
7804 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
7805 else
7806 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
7807 break;
7808 }
7809 case 'S': {
7810 // An absolute symbolic address or label reference.
7812 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
7813 GA->getValueType(0));
7814 } else if (const BlockAddressSDNode *BA =
7816 Result =
7818 } else if (const ExternalSymbolSDNode *ES =
7820 Result =
7821 DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
7822 } else
7823 return;
7824 break;
7825 }
7826
7827 case 'I':
7828 case 'J':
7829 case 'K':
7830 case 'L':
7831 case 'M':
7832 case 'N':
7834 if (!C)
7835 return;
7836
7837 // Grab the value and do some validation.
7838 uint64_t CVal = C->getZExtValue();
7839 switch (ConstraintLetter) {
7840 // The I constraint applies only to simple ADD or SUB immediate operands:
7841 // i.e. 0 to 4095 with optional shift by 12
7842 // The J constraint applies only to ADD or SUB immediates that would be
7843 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
7844 // instruction [or vice versa], in other words -1 to -4095 with optional
7845 // left shift by 12.
7846 case 'I':
7847 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
7848 break;
7849 return;
7850 case 'J': {
7851 uint64_t NVal = -C->getSExtValue();
7853 CVal = C->getSExtValue();
7854 break;
7855 }
7856 return;
7857 }
7858 // The K and L constraints apply *only* to logical immediates, including
7859 // what used to be the MOVI alias for ORR (though the MOVI alias has now
7860 // been removed and MOV should be used). So these constraints have to
7861 // distinguish between bit patterns that are valid 32-bit or 64-bit
7862 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
7863 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
7864 // versa.
7865 case 'K':
7866 if (AArch64_AM::isLogicalImmediate(CVal, 32))
7867 break;
7868 return;
7869 case 'L':
7870 if (AArch64_AM::isLogicalImmediate(CVal, 64))
7871 break;
7872 return;
7873 // The M and N constraints are a superset of K and L respectively, for use
7874 // with the MOV (immediate) alias. As well as the logical immediates they
7875 // also match 32 or 64-bit immediates that can be loaded either using a
7876 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
7877 // (M) or 64-bit 0x1234000000000000 (N) etc.
7878 // As a note some of this code is liberally stolen from the asm parser.
7879 case 'M': {
7880 if (!isUInt<32>(CVal))
7881 return;
7882 if (AArch64_AM::isLogicalImmediate(CVal, 32))
7883 break;
7884 if ((CVal & 0xFFFF) == CVal)
7885 break;
7886 if ((CVal & 0xFFFF0000ULL) == CVal)
7887 break;
7888 uint64_t NCVal = ~(uint32_t)CVal;
7889 if ((NCVal & 0xFFFFULL) == NCVal)
7890 break;
7891 if ((NCVal & 0xFFFF0000ULL) == NCVal)
7892 break;
7893 return;
7894 }
7895 case 'N': {
7896 if (AArch64_AM::isLogicalImmediate(CVal, 64))
7897 break;
7898 if ((CVal & 0xFFFFULL) == CVal)
7899 break;
7900 if ((CVal & 0xFFFF0000ULL) == CVal)
7901 break;
7902 if ((CVal & 0xFFFF00000000ULL) == CVal)
7903 break;
7904 if ((CVal & 0xFFFF000000000000ULL) == CVal)
7905 break;
7906 uint64_t NCVal = ~CVal;
7907 if ((NCVal & 0xFFFFULL) == NCVal)
7908 break;
7909 if ((NCVal & 0xFFFF0000ULL) == NCVal)
7910 break;
7911 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
7912 break;
7913 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
7914 break;
7915 return;
7916 }
7917 default:
7918 return;
7919 }
7920
7921 // All assembler immediates are 64-bit integers.
7922 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
7923 break;
7924 }
7925
7926 if (Result.getNode()) {
7927 Ops.push_back(Result);
7928 return;
7929 }
7930
7931 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
7932}
7933
7934//===----------------------------------------------------------------------===//
7935// AArch64 Advanced SIMD Support
7936//===----------------------------------------------------------------------===//
7937
7938/// WidenVector - Given a value in the V64 register class, produce the
7939/// equivalent value in the V128 register class.
7941 EVT VT = V64Reg.getValueType();
7942 unsigned NarrowSize = VT.getVectorNumElements();
7943 MVT EltTy = VT.getVectorElementType().getSimpleVT();
7944 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
7945 SDLoc DL(V64Reg);
7946
7948 V64Reg, DAG.getConstant(0, DL, MVT::i32));
7949}
7950
7951/// getExtFactor - Determine the adjustment factor for the position when
7952/// generating an "extract from vector registers" instruction.
7953static unsigned getExtFactor(SDValue &V) {
7954 EVT EltType = V.getValueType().getVectorElementType();
7955 return EltType.getSizeInBits() / 8;
7956}
7957
7958/// NarrowVector - Given a value in the V128 register class, produce the
7959/// equivalent value in the V64 register class.
7961 EVT VT = V128Reg.getValueType();
7962 unsigned WideSize = VT.getVectorNumElements();
7963 MVT EltTy = VT.getVectorElementType().getSimpleVT();
7964 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
7965 SDLoc DL(V128Reg);
7966
7967 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
7968}
7969
7970// Gather data to see if the operation can be modelled as a
7971// shuffle in combination with VEXTs.
7973 SelectionDAG &DAG) const {
7974 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7975 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
7976 SDLoc dl(Op);
7977 EVT VT = Op.getValueType();
7978 assert(!VT.isScalableVector() &&
7979 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
7980 unsigned NumElts = VT.getVectorNumElements();
7981
7982 struct ShuffleSourceInfo {
7983 SDValue Vec;
7984 unsigned MinElt;
7985 unsigned MaxElt;
7986
7987 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7988 // be compatible with the shuffle we intend to construct. As a result
7989 // ShuffleVec will be some sliding window into the original Vec.
7991
7992 // Code should guarantee that element i in Vec starts at element "WindowBase
7993 // + i * WindowScale in ShuffleVec".
7994 int WindowBase;
7995 int WindowScale;
7996
7998 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
7999 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
8000
8001 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8002 };
8003
8004 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8005 // node.
8007 for (unsigned i = 0; i < NumElts; ++i) {
8008 SDValue V = Op.getOperand(i);
8009 if (V.isUndef())
8010 continue;
8011 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8012 !isa<ConstantSDNode>(V.getOperand(1))) {
8013 LLVM_DEBUG(
8014 dbgs() << "Reshuffle failed: "
8015 "a shuffle can only come from building a vector from "
8016 "various elements of other vectors, provided their "
8017 "indices are constant\n");
8018 return SDValue();
8019 }
8020
8021 // Add this element source to the list if it's not already there.
8022 SDValue SourceVec = V.getOperand(0);
8023 auto Source = find(Sources, SourceVec);
8024 if (Source == Sources.end())
8025 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8026
8027 // Update the minimum and maximum lane number seen.
8028 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
8029 Source->MinElt = std::min(Source->MinElt, EltNo);
8030 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8031 }
8032
8033 if (Sources.size() > 2) {
8034 LLVM_DEBUG(
8035 dbgs() << "Reshuffle failed: currently only do something sane when at "
8036 "most two source vectors are involved\n");
8037 return SDValue();
8038 }
8039
8040 // Find out the smallest element size among result and two sources, and use
8041 // it as element size to build the shuffle_vector.
8043 for (auto &Source : Sources) {
8044 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8045 if (SrcEltTy.bitsLT(SmallestEltTy)) {
8047 }
8048 }
8049 unsigned ResMultiplier =
8050 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8051 uint64_t VTSize = VT.getFixedSizeInBits();
8052 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
8054
8055 // If the source vector is too wide or too narrow, we may nevertheless be able
8056 // to construct a compatible shuffle either by concatenating it with UNDEF or
8057 // extracting a suitable range of elements.
8058 for (auto &Src : Sources) {
8059 EVT SrcVT = Src.ShuffleVec.getValueType();
8060
8061 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8062 if (SrcVTSize == VTSize)
8063 continue;
8064
8065 // This stage of the search produces a source with the same element type as
8066 // the original, but with a total width matching the BUILD_VECTOR output.
8067 EVT EltVT = SrcVT.getVectorElementType();
8068 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8070
8071 if (SrcVTSize < VTSize) {
8072 assert(2 * SrcVTSize == VTSize);
8073 // We can pad out the smaller vector for free, so if it's part of a
8074 // shuffle...
8075 Src.ShuffleVec =
8076 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8077 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8078 continue;
8079 }
8080
8081 if (SrcVTSize != 2 * VTSize) {
8082 LLVM_DEBUG(
8083 dbgs() << "Reshuffle failed: result vector too small to extract\n");
8084 return SDValue();
8085 }
8086
8087 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8088 LLVM_DEBUG(
8089 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
8090 return SDValue();
8091 }
8092
8093 if (Src.MinElt >= NumSrcElts) {
8094 // The extraction can just take the second half
8095 Src.ShuffleVec =
8096 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8097 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8098 Src.WindowBase = -NumSrcElts;
8099 } else if (Src.MaxElt < NumSrcElts) {
8100 // The extraction can just take the first half
8101 Src.ShuffleVec =
8102 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8103 DAG.getConstant(0, dl, MVT::i64));
8104 } else {
8105 // An actual VEXT is needed
8107 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8108 DAG.getConstant(0, dl, MVT::i64));
8110 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8111 DAG.getConstant(NumSrcElts, dl, MVT::i64));
8112 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
8113
8114 if (!SrcVT.is64BitVector()) {
8115 LLVM_DEBUG(
8116 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
8117 "for SVE vectors.");
8118 return SDValue();
8119 }
8120
8121 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
8122 VEXTSrc2,
8123 DAG.getConstant(Imm, dl, MVT::i32));
8124 Src.WindowBase = -Src.MinElt;
8125 }
8126 }
8127
8128 // Another possible incompatibility occurs from the vector element types. We
8129 // can fix this by bitcasting the source vectors to the same type we intend
8130 // for the shuffle.
8131 for (auto &Src : Sources) {
8132 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8133 if (SrcEltTy == SmallestEltTy)
8134 continue;
8135 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8136 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
8137 Src.WindowScale =
8138 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8139 Src.WindowBase *= Src.WindowScale;
8140 }
8141
8142 // Final sanity check before we try to actually produce a shuffle.
8143 LLVM_DEBUG(for (auto Src
8144 : Sources)
8145 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8146
8147 // The stars all align, our next step is to produce the mask for the shuffle.
8148 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8149 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8150 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8151 SDValue Entry = Op.getOperand(i);
8152 if (Entry.isUndef())
8153 continue;
8154
8155 auto Src = find(Sources, Entry.getOperand(0));
8156 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8157
8158 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8159 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8160 // segment.
8161 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8162 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8163 VT.getScalarSizeInBits());
8165
8166 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8167 // starting at the appropriate offset.
8168 int *LaneMask = &Mask[i * ResMultiplier];
8169
8170 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8171 ExtractBase += NumElts * (Src - Sources.begin());
8172 for (int j = 0; j < LanesDefined; ++j)
8173 LaneMask[j] = ExtractBase + j;
8174 }
8175
8176 // Final check before we try to produce nonsense...
8177 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
8178 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
8179 return SDValue();
8180 }
8181
8183 for (unsigned i = 0; i < Sources.size(); ++i)
8184 ShuffleOps[i] = Sources[i].ShuffleVec;
8185
8186 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8187 ShuffleOps[1], Mask);
8188 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
8189
8190 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
8191 dbgs() << "Reshuffle, creating node: "; V.dump(););
8192
8193 return V;
8194}
8195
8196// check if an EXT instruction can handle the shuffle mask when the
8197// vector sources of the shuffle are the same.
8198static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
8199 unsigned NumElts = VT.getVectorNumElements();
8200
8201 // Assume that the first shuffle index is not UNDEF. Fail if it is.
8202 if (M[0] < 0)
8203 return false;
8204
8205 Imm = M[0];
8206
8207 // If this is a VEXT shuffle, the immediate value is the index of the first
8208 // element. The other shuffle indices must be the successive elements after
8209 // the first one.
8210 unsigned ExpectedElt = Imm;
8211 for (unsigned i = 1; i < NumElts; ++i) {
8212 // Increment the expected index. If it wraps around, just follow it
8213 // back to index zero and keep going.
8214 ++ExpectedElt;
8215 if (ExpectedElt == NumElts)
8216 ExpectedElt = 0;
8217
8218 if (M[i] < 0)
8219 continue; // ignore UNDEF indices
8220 if (ExpectedElt != static_cast<unsigned>(M[i]))
8221 return false;
8222 }
8223
8224 return true;
8225}
8226
8227/// Check if a vector shuffle corresponds to a DUP instructions with a larger
8228/// element width than the vector lane type. If that is the case the function
8229/// returns true and writes the value of the DUP instruction lane operand into
8230/// DupLaneOp
8231static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
8232 unsigned &DupLaneOp) {
8233 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8234 "Only possible block sizes for wide DUP are: 16, 32, 64");
8235
8236 if (BlockSize <= VT.getScalarSizeInBits())
8237 return false;
8238 if (BlockSize % VT.getScalarSizeInBits() != 0)
8239 return false;
8240 if (VT.getSizeInBits() % BlockSize != 0)
8241 return false;
8242
8245 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
8246
8247 // We are looking for masks like
8248 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
8249 // might be replaced by 'undefined'. BlockIndices will eventually contain
8250 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
8251 // for the above examples)
8253 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
8254 for (size_t I = 0; I < NumEltsPerBlock; I++) {
8255 int Elt = M[BlockIndex * NumEltsPerBlock + I];
8256 if (Elt < 0)
8257 continue;
8258 // For now we don't support shuffles that use the second operand
8259 if ((unsigned)Elt >= SingleVecNumElements)
8260 return false;
8261 if (BlockElts[I] < 0)
8262 BlockElts[I] = Elt;
8263 else if (BlockElts[I] != Elt)
8264 return false;
8265 }
8266
8267 // We found a candidate block (possibly with some undefs). It must be a
8268 // sequence of consecutive integers starting with a value divisible by
8269 // NumEltsPerBlock with some values possibly replaced by undef-s.
8270
8271 // Find first non-undef element
8272 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
8274 "Shuffle with all-undefs must have been caught by previous cases, "
8275 "e.g. isSplat()");
8276 if (FirstRealEltIter == BlockElts.end()) {
8277 DupLaneOp = 0;
8278 return true;
8279 }
8280
8281 // Index of FirstRealElt in BlockElts
8282 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
8283
8284 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
8285 return false;
8286 // BlockElts[0] must have the following value if it isn't undef:
8288
8289 // Check the first element
8290 if (Elt0 % NumEltsPerBlock != 0)
8291 return false;
8292 // Check that the sequence indeed consists of consecutive integers (modulo
8293 // undefs)
8294 for (size_t I = 0; I < NumEltsPerBlock; I++)
8295 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
8296 return false;
8297
8299 return true;
8300}
8301
8302// check if an EXT instruction can handle the shuffle mask when the
8303// vector sources of the shuffle are different.
8304static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
8305 unsigned &Imm) {
8306 // Look for the first non-undef element.
8307 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
8308
8309 // Benefit form APInt to handle overflow when calculating expected element.
8310 unsigned NumElts = VT.getVectorNumElements();
8311 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
8313 // The following shuffle indices must be the successive elements after the
8314 // first real element.
8315 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
8316 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
8317 if (FirstWrongElt != M.end())
8318 return false;
8319
8320 // The index of an EXT is the first element if it is not UNDEF.
8321 // Watch out for the beginning UNDEFs. The EXT index should be the expected
8322 // value of the first element. E.g.
8323 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
8324 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
8325 // ExpectedElt is the last mask index plus 1.
8326 Imm = ExpectedElt.getZExtValue();
8327
8328 // There are two difference cases requiring to reverse input vectors.
8329 // For example, for vector <4 x i32> we have the following cases,
8330 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
8331 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
8332 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
8333 // to reverse two input vectors.
8334 if (Imm < NumElts)
8335 ReverseEXT = true;
8336 else
8337 Imm -= NumElts;
8338
8339 return true;
8340}
8341
8342/// isREVMask - Check if a vector shuffle corresponds to a REV
8343/// instruction with the specified blocksize. (The order of the elements
8344/// within each block of the vector is reversed.)
8345static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
8346 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8347 "Only possible block sizes for REV are: 16, 32, 64");
8348
8349 unsigned EltSz = VT.getScalarSizeInBits();
8350 if (EltSz == 64)
8351 return false;
8352
8353 unsigned NumElts = VT.getVectorNumElements();
8354 unsigned BlockElts = M[0] + 1;
8355 // If the first shuffle index is UNDEF, be optimistic.
8356 if (M[0] < 0)
8358
8359 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
8360 return false;
8361
8362 for (unsigned i = 0; i < NumElts; ++i) {
8363 if (M[i] < 0)
8364 continue; // ignore UNDEF indices
8365 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
8366 return false;
8367 }
8368
8369 return true;
8370}
8371
8372static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8373 unsigned NumElts = VT.getVectorNumElements();
8374 if (NumElts % 2 != 0)
8375 return false;
8376 WhichResult = (M[0] == 0 ? 0 : 1);
8377 unsigned Idx = WhichResult * NumElts / 2;
8378 for (unsigned i = 0; i != NumElts; i += 2) {
8379 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8380 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
8381 return false;
8382 Idx += 1;
8383 }
8384
8385 return true;
8386}
8387
8388static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8389 unsigned NumElts = VT.getVectorNumElements();
8390 WhichResult = (M[0] == 0 ? 0 : 1);
8391 for (unsigned i = 0; i != NumElts; ++i) {
8392 if (M[i] < 0)
8393 continue; // ignore UNDEF indices
8394 if ((unsigned)M[i] != 2 * i + WhichResult)
8395 return false;
8396 }
8397
8398 return true;
8399}
8400
8401static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8402 unsigned NumElts = VT.getVectorNumElements();
8403 if (NumElts % 2 != 0)
8404 return false;
8405 WhichResult = (M[0] == 0 ? 0 : 1);
8406 for (unsigned i = 0; i < NumElts; i += 2) {
8407 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8408 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
8409 return false;
8410 }
8411 return true;
8412}
8413
8414/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
8415/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8416/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
8417static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8418 unsigned NumElts = VT.getVectorNumElements();
8419 if (NumElts % 2 != 0)
8420 return false;
8421 WhichResult = (M[0] == 0 ? 0 : 1);
8422 unsigned Idx = WhichResult * NumElts / 2;
8423 for (unsigned i = 0; i != NumElts; i += 2) {
8424 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8425 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
8426 return false;
8427 Idx += 1;
8428 }
8429
8430 return true;
8431}
8432
8433/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
8434/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8435/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
8436static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8437 unsigned Half = VT.getVectorNumElements() / 2;
8438 WhichResult = (M[0] == 0 ? 0 : 1);
8439 for (unsigned j = 0; j != 2; ++j) {
8440 unsigned Idx = WhichResult;
8441 for (unsigned i = 0; i != Half; ++i) {
8442 int MIdx = M[i + j * Half];
8443 if (MIdx >= 0 && (unsigned)MIdx != Idx)
8444 return false;
8445 Idx += 2;
8446 }
8447 }
8448
8449 return true;
8450}
8451
8452/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
8453/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8454/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
8455static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8456 unsigned NumElts = VT.getVectorNumElements();
8457 if (NumElts % 2 != 0)
8458 return false;
8459 WhichResult = (M[0] == 0 ? 0 : 1);
8460 for (unsigned i = 0; i < NumElts; i += 2) {
8461 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8462 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
8463 return false;
8464 }
8465 return true;
8466}
8467
8469 bool &DstIsLeft, int &Anomaly) {
8470 if (M.size() != static_cast<size_t>(NumInputElements))
8471 return false;
8472
8473 int NumLHSMatch = 0, NumRHSMatch = 0;
8474 int LastLHSMismatch = -1, LastRHSMismatch = -1;
8475
8476 for (int i = 0; i < NumInputElements; ++i) {
8477 if (M[i] == -1) {
8478 ++NumLHSMatch;
8479 ++NumRHSMatch;
8480 continue;
8481 }
8482
8483 if (M[i] == i)
8484 ++NumLHSMatch;
8485 else
8487
8488 if (M[i] == i + NumInputElements)
8489 ++NumRHSMatch;
8490 else
8492 }
8493
8494 if (NumLHSMatch == NumInputElements - 1) {
8495 DstIsLeft = true;
8497 return true;
8498 } else if (NumRHSMatch == NumInputElements - 1) {
8499 DstIsLeft = false;
8501 return true;
8502 }
8503
8504 return false;
8505}
8506
8507static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
8508 if (VT.getSizeInBits() != 128)
8509 return false;
8510
8511 unsigned NumElts = VT.getVectorNumElements();
8512
8513 for (int I = 0, E = NumElts / 2; I != E; I++) {
8514 if (Mask[I] != I)
8515 return false;
8516 }
8517
8518 int Offset = NumElts / 2;
8519 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
8520 if (Mask[I] != I + SplitLHS * Offset)
8521 return false;
8522 }
8523
8524 return true;
8525}
8526
8528 SDLoc DL(Op);
8529 EVT VT = Op.getValueType();
8530 SDValue V0 = Op.getOperand(0);
8531 SDValue V1 = Op.getOperand(1);
8532 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
8533
8534 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
8535 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
8536 return SDValue();
8537
8538 bool SplitV0 = V0.getValueSizeInBits() == 128;
8539
8540 if (!isConcatMask(Mask, VT, SplitV0))
8541 return SDValue();
8542
8544 if (SplitV0) {
8546 DAG.getConstant(0, DL, MVT::i64));
8547 }
8548 if (V1.getValueSizeInBits() == 128) {
8550 DAG.getConstant(0, DL, MVT::i64));
8551 }
8552 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
8553}
8554
8555/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8556/// the specified operations to build the shuffle.
8558 SDValue RHS, SelectionDAG &DAG,
8559 const SDLoc &dl) {
8560 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8561 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
8562 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
8563
8564 enum {
8565 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8566 OP_VREV,
8567 OP_VDUP0,
8568 OP_VDUP1,
8569 OP_VDUP2,
8570 OP_VDUP3,
8571 OP_VEXT1,
8572 OP_VEXT2,
8573 OP_VEXT3,
8574 OP_VUZPL, // VUZP, left result
8575 OP_VUZPR, // VUZP, right result
8576 OP_VZIPL, // VZIP, left result
8577 OP_VZIPR, // VZIP, right result
8578 OP_VTRNL, // VTRN, left result
8579 OP_VTRNR // VTRN, right result
8580 };
8581
8582 if (OpNum == OP_COPY) {
8583 if (LHSID == (1 * 9 + 2) * 9 + 3)
8584 return LHS;
8585 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
8586 return RHS;
8587 }
8588
8592 EVT VT = OpLHS.getValueType();
8593
8594 switch (OpNum) {
8595 default:
8596 llvm_unreachable("Unknown shuffle opcode!");
8597 case OP_VREV:
8598 // VREV divides the vector in half and swaps within the half.
8599 if (VT.getVectorElementType() == MVT::i32 ||
8601 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
8602 // vrev <4 x i16> -> REV32
8603 if (VT.getVectorElementType() == MVT::i16 ||
8606 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
8607 // vrev <4 x i8> -> REV16
8609 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
8610 case OP_VDUP0:
8611 case OP_VDUP1:
8612 case OP_VDUP2:
8613 case OP_VDUP3: {
8614 EVT EltTy = VT.getVectorElementType();
8615 unsigned Opcode;
8616 if (EltTy == MVT::i8)
8617 Opcode = AArch64ISD::DUPLANE8;
8618 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
8619 Opcode = AArch64ISD::DUPLANE16;
8620 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
8621 Opcode = AArch64ISD::DUPLANE32;
8622 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
8623 Opcode = AArch64ISD::DUPLANE64;
8624 else
8625 llvm_unreachable("Invalid vector element type?");
8626
8627 if (VT.getSizeInBits() == 64)
8628 OpLHS = WidenVector(OpLHS, DAG);
8629 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
8630 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
8631 }
8632 case OP_VEXT1:
8633 case OP_VEXT2:
8634 case OP_VEXT3: {
8635 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
8636 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
8637 DAG.getConstant(Imm, dl, MVT::i32));
8638 }
8639 case OP_VUZPL:
8640 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
8641 OpRHS);
8642 case OP_VUZPR:
8643 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
8644 OpRHS);
8645 case OP_VZIPL:
8646 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
8647 OpRHS);
8648 case OP_VZIPR:
8649 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
8650 OpRHS);
8651 case OP_VTRNL:
8652 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
8653 OpRHS);
8654 case OP_VTRNR:
8655 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
8656 OpRHS);
8657 }
8658}
8659
8661 SelectionDAG &DAG) {
8662 // Check to see if we can use the TBL instruction.
8663 SDValue V1 = Op.getOperand(0);
8664 SDValue V2 = Op.getOperand(1);
8665 SDLoc DL(Op);
8666
8667 EVT EltVT = Op.getValueType().getVectorElementType();
8668 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
8669
8671 for (int Val : ShuffleMask) {
8672 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
8673 unsigned Offset = Byte + Val * BytesPerElt;
8674 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
8675 }
8676 }
8677
8679 unsigned IndexLen = 8;
8680 if (Op.getValueSizeInBits() == 128) {
8682 IndexLen = 16;
8683 }
8684
8687
8688 SDValue Shuffle;
8689 if (V2.getNode()->isUndef()) {
8690 if (IndexLen == 8)
8692 Shuffle = DAG.getNode(
8694 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
8696 makeArrayRef(TBLMask.data(), IndexLen)));
8697 } else {
8698 if (IndexLen == 8) {
8700 Shuffle = DAG.getNode(
8702 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
8704 makeArrayRef(TBLMask.data(), IndexLen)));
8705 } else {
8706 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
8707 // cannot currently represent the register constraints on the input
8708 // table registers.
8709 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
8710 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
8711 // IndexLen));
8712 Shuffle = DAG.getNode(
8714 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
8716 makeArrayRef(TBLMask.data(), IndexLen)));
8717 }
8718 }
8719 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
8720}
8721
8722static unsigned getDUPLANEOp(EVT EltType) {
8723 if (EltType == MVT::i8)
8724 return AArch64ISD::DUPLANE8;
8725 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
8726 return AArch64ISD::DUPLANE16;
8727 if (EltType == MVT::i32 || EltType == MVT::f32)
8728 return AArch64ISD::DUPLANE32;
8729 if (EltType == MVT::i64 || EltType == MVT::f64)
8730 return AArch64ISD::DUPLANE64;
8731
8732 llvm_unreachable("Invalid vector element type?");
8733}
8734
8735static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
8736 unsigned Opcode, SelectionDAG &DAG) {
8737 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
8738 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
8739 // Match: dup (bitcast (extract_subv X, C)), LaneC
8740 if (BitCast.getOpcode() != ISD::BITCAST ||
8741 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
8742 return false;
8743
8744 // The extract index must align in the destination type. That may not
8745 // happen if the bitcast is from narrow to wide type.
8746 SDValue Extract = BitCast.getOperand(0);
8747 unsigned ExtIdx = Extract.getConstantOperandVal(1);
8748 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
8749 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
8750 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
8752 return false;
8753
8754 // Update the lane value by offsetting with the scaled extract index.
8756
8757 // Determine the casted vector type of the wide vector input.
8758 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
8759 // Examples:
8760 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
8761 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
8762 unsigned SrcVecNumElts =
8764 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
8766 return true;
8767 };
8768 MVT CastVT;
8769 if (getScaledOffsetDup(V, Lane, CastVT)) {
8770 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
8771 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
8772 // The lane is incremented by the index of the extract.
8773 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
8774 Lane += V.getConstantOperandVal(1);
8775 V = V.getOperand(0);
8776 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
8777 // The lane is decremented if we are splatting from the 2nd operand.
8778 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
8779 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
8780 Lane -= Idx * VT.getVectorNumElements() / 2;
8781 V = WidenVector(V.getOperand(Idx), DAG);
8782 } else if (VT.getSizeInBits() == 64) {
8783 // Widen the operand to 128-bit register with undef.
8784 V = WidenVector(V, DAG);
8785 }
8786 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
8787}
8788
// Lower ISD::VECTOR_SHUFFLE to AArch64-specific shuffle nodes (DUP/DUPLANE,
// REV, EXT, ZIP/UZP/TRN, INS, the perfect-shuffle table, or a TBL fallback).
// NOTE(review): this listing elides several original lines (e.g. the cast of
// Op to ShuffleVectorSDNode before SVN is used, and parts of the wide-DUP and
// INS paths) — consult the full source before editing.
8789 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
8790 SelectionDAG &DAG) const {
8791 SDLoc dl(Op);
8792 EVT VT = Op.getValueType();
8793
8795
8796 // Convert shuffles that are directly supported on NEON to target-specific
8797 // DAG nodes, instead of keeping them as shuffles and matching them again
8798 // during code selection. This is more efficient and avoids the possibility
8799 // of inconsistencies between legalization and selection.
8800 ArrayRef<int> ShuffleMask = SVN->getMask();
8801
8802 SDValue V1 = Op.getOperand(0);
8803 SDValue V2 = Op.getOperand(1);
8804
// Splat shuffles: lower to DUP (from scalar) or DUPLANE (from a vector lane).
8805 if (SVN->isSplat()) {
8806 int Lane = SVN->getSplatIndex();
8807 // If this is undef splat, generate it via "just" vdup, if possible.
8808 if (Lane == -1)
8809 Lane = 0;
8810
// Splatting lane 0 of a SCALAR_TO_VECTOR is just a DUP of the scalar.
8811 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
8812 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
8813 V1.getOperand(0));
8814 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
8815 // constant. If so, we can just reference the lane's definition directly.
8816 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
8817 !isa<ConstantSDNode>(V1.getOperand(Lane)))
8818 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
8819
8820 // Otherwise, duplicate from the lane of the input vector.
8821 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
8822 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
8823 }
8824
8825 // Check if the mask matches a DUP for a wider element
// Try the widest element size first so we prefer fewer, larger lanes.
8826 for (unsigned LaneSize : {64U, 32U, 16U}) {
8827 unsigned Lane = 0;
8828 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
8829 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
8832 // Cast V1 to an integer vector with required lane size
8834 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
8836 V1 = DAG.getBitcast(NewVecTy, V1);
8837 // Construct the DUP instruction
8838 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
8839 // Cast back to the original type
8840 return DAG.getBitcast(VT, V1);
8841 }
8842 }
8843
// Reversal shuffles map onto REV64/REV32/REV16.
8844 if (isREVMask(ShuffleMask, VT, 64))
8845 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
8846 if (isREVMask(ShuffleMask, VT, 32))
8847 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
8848 if (isREVMask(ShuffleMask, VT, 16))
8849 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
8850
// Byte-extraction shuffles: EXT takes a byte offset, hence the scaling by
// getExtFactor (element size in bytes).
8851 bool ReverseEXT = false;
8852 unsigned Imm;
8853 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
8854 if (ReverseEXT)
8855 std::swap(V1, V2);
8856 Imm *= getExtFactor(V1);
8857 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
8858 DAG.getConstant(Imm, dl, MVT::i32));
8859 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
// Single-input rotate: EXT with both operands equal to V1.
8860 Imm *= getExtFactor(V1);
8861 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
8862 DAG.getConstant(Imm, dl, MVT::i32));
8863 }
8864
// Interleave/deinterleave/transpose patterns (two-input forms).
8865 unsigned WhichResult;
8866 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
8867 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
8868 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8869 }
8870 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
8871 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
8872 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8873 }
8874 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
8875 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
8876 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8877 }
8878
// Same patterns where the second input is undef: feed V1 to both operands.
8879 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8880 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
8881 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8882 }
8883 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8884 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
8885 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8886 }
8887 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8888 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
8889 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8890 }
8891
// NOTE(review): the line producing Concat is elided in this listing;
// presumably a tryFormConcatFromShuffle-style helper — confirm in full source.
8893 return Concat;
8894
// Single-anomaly masks lower to an INS (insert one lane, keep the rest).
8895 bool DstIsLeft;
8896 int Anomaly;
8897 int NumInputElements = V1.getValueType().getVectorNumElements();
8898 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
8901
8902 SDValue SrcVec = V1;
8903 int SrcLane = ShuffleMask[Anomaly];
// Mask indices >= NumInputElements refer to lanes of V2.
8904 if (SrcLane >= NumInputElements) {
8905 SrcVec = V2;
8907 }
8909
8911
8912 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
8914
8915 return DAG.getNode(
8918 DstLaneV);
8919 }
8920
8921 // If the shuffle is not directly supported and it has 4 elements, use
8922 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8923 unsigned NumElts = VT.getVectorNumElements();
8924 if (NumElts == 4) {
8925 unsigned PFIndexes[4];
8926 for (unsigned i = 0; i != 4; ++i) {
// 8 encodes an undef lane in the perfect-shuffle index space (base 9).
8927 if (ShuffleMask[i] < 0)
8928 PFIndexes[i] = 8;
8929 else
8930 PFIndexes[i] = ShuffleMask[i];
8931 }
8932
8933 // Compute the index in the perfect shuffle table.
8934 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
8935 PFIndexes[2] * 9 + PFIndexes[3];
8937 unsigned Cost = (PFEntry >> 30);
8938
// Only use the table expansion when it is cheap enough (cost in top bits).
8939 if (Cost <= 4)
8940 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8941 }
8942
// Last resort: a table lookup (TBL) handles any remaining mask.
8943 return GenerateTBL(Op, ShuffleMask, DAG);
8944}
8945
// Lower ISD::SPLAT_VECTOR. Fixed-length vectors that should use SVE are
// re-lowered as scalable ops; otherwise the splat becomes an AArch64ISD::DUP,
// with integer splat values widened as needed to fit a GPR (i8/i16 -> i32).
// i1 splats only exist for SVE predicates and get PTRUE/WHILELO lowering.
// NOTE(review): the lines that extend/truncate SplatVal for the integer cases
// are elided in this listing — confirm in full source.
8946 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
8947 SelectionDAG &DAG) const {
8948 SDLoc dl(Op);
8949 EVT VT = Op.getValueType();
8950 EVT ElemVT = VT.getScalarType();
8951 SDValue SplatVal = Op.getOperand(0);
8952
8953 if (useSVEForFixedLengthVectorVT(VT))
8954 return LowerToScalableOp(Op, DAG);
8955
8956 // Extend input splat value where needed to fit into a GPR (32b or 64b only)
8957 // FPRs don't have this restriction.
8958 switch (ElemVT.getSimpleVT().SimpleTy) {
8959 case MVT::i1: {
8960 // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
8961 // lowering code.
8962 if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
// splat(true) is exactly an all-lanes-true predicate: PTRUE.
8963 if (ConstVal->isOne())
8964 return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
8965 // TODO: Add special case for constant false
8966 }
8967 // The general case of i1. There isn't any natural way to do this,
8968 // so we use some trickery with whilelo.
8971 DAG.getValueType(MVT::i1));
8972 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
8973 MVT::i64);
// whilelo(0, SplatVal) yields all-true when SplatVal != 0, all-false
// otherwise — i.e. a predicate splat of the (boolean) value.
8974 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
8975 DAG.getConstant(0, dl, MVT::i64), SplatVal);
8976 }
8977 case MVT::i8:
8978 case MVT::i16:
8979 case MVT::i32:
8981 break;
8982 case MVT::i64:
8984 break;
8985 case MVT::f16:
8986 case MVT::bf16:
8987 case MVT::f32:
8988 case MVT::f64:
8989 // Fine as is
8990 break;
8991 default:
8992 report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
8993 }
8994
8995 return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
8996}
8997
// Lower the SVE dupq-lane operation: broadcast one 128-bit chunk of an SVE
// register to every 128-bit chunk. Uses DUP_ZZI_Q when the index is a small
// constant, and falls back to an ACLE-mandated TBL sequence otherwise.
8998 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
8999 SelectionDAG &DAG) const {
9000 SDLoc DL(Op);
9001
9002 EVT VT = Op.getValueType();
9003 if (!isTypeLegal(VT) || !VT.isScalableVector())
9004 return SDValue();
9005
9006 // Current lowering only supports the SVE-ACLE types.
9008 return SDValue();
9009
9010 // The DUPQ operation is independent of element type so normalise to i64s.
9011 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
9012 SDValue Idx128 = Op.getOperand(2);
9013
9014 // DUPQ can be used when idx is in range.
// NOTE(review): the dyn_cast producing CIdx is elided in this listing.
9016 if (CIdx && (CIdx->getZExtValue() <= 3)) {
9017 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
9018 SDNode *DUPQ =
9019 DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
9020 return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
9021 }
9022
9023 // The ACLE says this must produce the same result as:
9024 // svtbl(data, svadd_x(svptrue_b64(),
9025 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
9026 // index * 2))
9027 SDValue One = DAG.getConstant(1, DL, MVT::i64);
9029
9030 // create the vector 0,1,0,1,...
9031 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
9033 DL, MVT::nxv2i64, Zero, One);
9034 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
9035
9036 // create the vector idx64,idx64+1,idx64,idx64+1,...
9039 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
9040
9041 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
9042 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
9043 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
9044}
9045
9046
// Collect the constant (and undef) bits of a splatted BUILD_VECTOR into two
// full-width APInt masks: CnstBits receives the replicated splat value,
// UndefBits marks which bits came from undef lanes. Returns false if BVN is
// not a constant splat. NOTE(review): the signature's first line is elided in
// this listing (first param is the BuildVectorSDNode and the CnstBits APInt).
9048 APInt &UndefBits) {
9049 EVT VT = BVN->getValueType(0);
9050 APInt SplatBits, SplatUndef;
9051 unsigned SplatBitSize;
9052 bool HasAnyUndefs;
9053 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9054 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
9055
// Replicate the splat chunk across the whole vector width by repeated
// shift-and-or.
9056 for (unsigned i = 0; i < NumSplats; ++i) {
9057 CnstBits <<= SplatBitSize;
9058 UndefBits <<= SplatBitSize;
9059 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
9060 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
9061 }
9062
9063 return true;
9064 }
9065
9066 return false;
9067}
9068
9069 // Try 64-bit splatted SIMD immediate.
// Returns a NewOp (e.g. MOVI) node wrapped in an NVCAST if the 64-bit splat
// value is encodable as an AdvSIMD modified immediate; SDValue() otherwise.
// The hi==lo check confirms the 128-bit pattern really is a 64-bit splat.
// NOTE(review): the signature line and the isAdvSIMDModImmType10 check are
// elided in this listing.
9071 const APInt &Bits) {
9072 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9073 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9074 EVT VT = Op.getValueType();
9075 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
9076
9079
9080 SDLoc dl(Op);
9081 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9082 DAG.getConstant(Value, dl, MVT::i32));
// NVCAST re-types the result without generating any code.
9083 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9084 }
9085 }
9086
9087 return SDValue();
9088}
9089
9090// Try 32-bit splatted SIMD immediate.
9092 const APInt &Bits,
9093 const SDValue *LHS = nullptr) {
9094 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9095 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9096 EVT VT = Op.getValueType();
9097 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9098 bool isAdvSIMDModImm = false;
9099 uint64_t Shift;
9100
9103 Shift = 0;
9104 }
9107 Shift = 8;
9108 }
9111 Shift = 16;
9112 }
9115 Shift = 24;
9116 }
9117
9118 if (isAdvSIMDModImm) {
9119 SDLoc dl(Op);
9120 SDValue Mov;
9121
9122 if (LHS)
9123 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9124 DAG.getConstant(Value, dl, MVT::i32),
9125 DAG.getConstant(Shift, dl, MVT::i32));
9126 else
9127 Mov = DAG.getNode(NewOp, dl, MovTy,
9128 DAG.getConstant(Value, dl, MVT::i32),
9129 DAG.getConstant(Shift, dl, MVT::i32));
9130
9131 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9132 }
9133 }
9134
9135 return SDValue();
9136}
9137
9138// Try 16-bit splatted SIMD immediate.
9140 const APInt &Bits,
9141 const SDValue *LHS = nullptr) {
9142 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9143 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9144 EVT VT = Op.getValueType();
9145 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
9146 bool isAdvSIMDModImm = false;
9147 uint64_t Shift;
9148
9151 Shift = 0;
9152 }
9155 Shift = 8;
9156 }
9157
9158 if (isAdvSIMDModImm) {
9159 SDLoc dl(Op);
9160 SDValue Mov;
9161
9162 if (LHS)
9163 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9164 DAG.getConstant(Value, dl, MVT::i32),
9165 DAG.getConstant(Shift, dl, MVT::i32));
9166 else
9167 Mov = DAG.getNode(NewOp, dl, MovTy,
9168 DAG.getConstant(Value, dl, MVT::i32),
9169 DAG.getConstant(Shift, dl, MVT::i32));
9170
9171 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9172 }
9173 }
9174
9175 return SDValue();
9176}
9177
9178 // Try 32-bit splatted SIMD immediate with shifted ones.
// Handles the MOVI "MSL" (shifting-ones) forms. The Shift values 264/272 are
// the target's shifter-operand encodings for MSL #8 / MSL #16 respectively
// (not plain bit counts) — presumably per AArch64_AM; confirm in full source.
// NOTE(review): the signature line and the isAdvSIMDModImmType7/8 checks are
// elided in this listing.
9180 SelectionDAG &DAG, const APInt &Bits) {
9181 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9182 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9183 EVT VT = Op.getValueType();
9184 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9185 bool isAdvSIMDModImm = false;
9186 uint64_t Shift;
9187
9190 Shift = 264;
9191 }
9194 Shift = 272;
9195 }
9196
9197 if (isAdvSIMDModImm) {
9198 SDLoc dl(Op);
9199 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9200 DAG.getConstant(Value, dl, MVT::i32),
9201 DAG.getConstant(Shift, dl, MVT::i32));
9202 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9203 }
9204 }
9205
9206 return SDValue();
9207}
9208
9209 // Try 8-bit splatted SIMD immediate.
// Every 8-bit value is encodable in the byte-splat MOVI form; the elided
// check (isAdvSIMDModImmType9) gates on the 64-bit pattern being a byte splat.
// NOTE(review): the signature's first line is elided in this listing.
9211 const APInt &Bits) {
9212 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9213 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9214 EVT VT = Op.getValueType();
9215 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
9216
9219
9220 SDLoc dl(Op);
9221 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9222 DAG.getConstant(Value, dl, MVT::i32));
9223 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9224 }
9225 }
9226
9227 return SDValue();
9228}
9229
9230// Try FP splatted SIMD immediate.
9232 const APInt &Bits) {
9233 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9234 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9235 EVT VT = Op.getValueType();
9236 bool isWide = (VT.getSizeInBits() == 128);
9237 MVT MovTy;
9238 bool isAdvSIMDModImm = false;
9239
9243 }
9244 else if (isWide &&
9247 MovTy = MVT::v2f64;
9248 }
9249
9250 if (isAdvSIMDModImm) {
9251 SDLoc dl(Op);
9252 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9253 DAG.getConstant(Value, dl, MVT::i32));
9254 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9255 }
9256 }
9257
9258 return SDValue();
9259}
9260
9261 // Specialized code to quickly find if PotentialBVec is a BuildVector that
9262 // consists of only the same constant int value, returned in reference arg
9263 // ConstVal
// NOTE(review): the signature's first line (taking PotentialBVec) and the
// dyn_casts producing Bvec and FirstElt are elided in this listing.
9265 uint64_t &ConstVal) {
9267 if (!Bvec)
9268 return false;
9270 if (!FirstElt)
9271 return false;
9272 EVT VT = Bvec->getValueType(0);
9273 unsigned NumElts = VT.getVectorNumElements();
// Pointer comparison works because identical ConstantSDNodes are uniqued
// by the SelectionDAG.
9274 for (unsigned i = 1; i < NumElts; ++i)
9275 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
9276 return false;
9277 ConstVal = FirstElt->getZExtValue();
9278 return true;
9279}
9280
// Return the intrinsic ID carried by an INTRINSIC_WO_CHAIN node, or (in the
// elided default/fallthrough paths) Intrinsic::not_intrinsic for anything
// else. NOTE(review): the default-case return and the case label are elided
// in this listing.
9281 static unsigned getIntrinsicID(const SDNode *N) {
9282 unsigned Opcode = N->getOpcode();
9283 switch (Opcode) {
9284 default:
// For intrinsic nodes, operand 0 is the intrinsic ID constant.
9287 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9288 if (IID < Intrinsic::num_intrinsics)
9289 return IID;
9291 }
9292 }
9293}
9294
9295 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
9296 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
9297 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
9298 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
9299 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
9300 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// NOTE(review): the signature line and several statements (shift-opcode
// checks, the BICi immediate extraction, the RequiredC1 computation, and the
// SLI/SRI opcode selection) are elided in this listing.
9302 EVT VT = N->getValueType(0);
9303
9304 if (!VT.isVector())
9305 return SDValue();
9306
9307 SDLoc DL(N);
9308
9309 SDValue And;
9310 SDValue Shift;
9311
9312 SDValue FirstOp = N->getOperand(0);
9313 unsigned FirstOpc = FirstOp.getOpcode();
9314 SDValue SecondOp = N->getOperand(1);
9315 unsigned SecondOpc = SecondOp.getOpcode();
9316
9317 // Is one of the operands an AND or a BICi? The AND may have been optimised to
9318 // a BICi in order to use an immediate instead of a register.
9319 // Is the other operand an shl or lshr? This will have been turned into:
9320 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
9321 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
9323 And = FirstOp;
9324 Shift = SecondOp;
9325
9326 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
9328 And = SecondOp;
9329 Shift = FirstOp;
9330 } else
9331 return SDValue();
9332
9333 bool IsAnd = And.getOpcode() == ISD::AND;
9334 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
9335
9336 // Is the shift amount constant?
9338 if (!C2node)
9339 return SDValue();
9340
9341 uint64_t C1;
9342 if (IsAnd) {
9343 // Is the and mask vector all constant?
9344 if (!isAllConstantBuildVector(And.getOperand(1), C1))
9345 return SDValue();
9346 } else {
9347 // Reconstruct the corresponding AND immediate from the two BICi immediates.
9351 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
9352 }
9353
9354 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
9355 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
9356 // how much one can shift elements of a particular size?
9357 uint64_t C2 = C2node->getZExtValue();
9358 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
9359 if (C2 > ElemSizeInBits)
9360 return SDValue();
9361
9365 if (C1AsAPInt != RequiredC1)
9366 return SDValue();
9367
9368 SDValue X = And.getOperand(0);
9369 SDValue Y = Shift.getOperand(0);
9370
// Inst is SLI or SRI depending on the shift direction (elided selection).
9372 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
9373
9374 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
9375 LLVM_DEBUG(N->dump(&DAG));
9376 LLVM_DEBUG(dbgs() << "into: \n");
9377 LLVM_DEBUG(ResultSLI->dump(&DAG));
9378
9380 return ResultSLI;
9381}
9382
// Lower a vector ISD::OR: first try the SLI/SRI pattern, then try folding a
// constant-splat operand into an ORR-immediate (on both defined and undef
// bits); otherwise keep the plain OR. NOTE(review): the BVN declaration, the
// resolveBuildVector call, and the tryAdvSIMDModImm32/16 call lines are
// partially elided in this listing.
9383 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
9384 SelectionDAG &DAG) const {
9385 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
9386 return LowerToScalableOp(Op, DAG);
9387
9388 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
9389 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
9390 return Res;
9391
9392 EVT VT = Op.getValueType();
9393
9394 SDValue LHS = Op.getOperand(0);
9396 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
9397 if (!BVN) {
9398 // OR commutes, so try swapping the operands.
9399 LHS = Op.getOperand(1);
9400 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
9401 }
9402 if (!BVN)
9403 return Op;
9404
9405 APInt DefBits(VT.getSizeInBits(), 0);
9406 APInt UndefBits(VT.getSizeInBits(), 0);
9408 SDValue NewOp;
9409
// First try the fully-defined bits, then retry treating undef lanes as
// whatever value makes the immediate encodable.
9411 DefBits, &LHS)) ||
9413 DefBits, &LHS)))
9414 return NewOp;
9415
9417 UndefBits, &LHS)) ||
9419 UndefBits, &LHS)))
9420 return NewOp;
9421 }
9422
9423 // We can always fall back to a non-immediate OR.
9424 return Op;
9425}
9426
9427 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
9428 // be truncated to fit element width.
// Only applies to integer vectors with elements of 16 bits or fewer; the
// rebuilt operands are i32 constants holding just the low element-width bits.
// NOTE(review): the signature's first line and the Ops SmallVector
// declaration are elided in this listing.
9430 SelectionDAG &DAG) {
9431 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9432 SDLoc dl(Op);
9433 EVT VT = Op.getValueType();
9434 EVT EltTy= VT.getVectorElementType();
9435
9436 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
9437 return Op;
9438
9440 for (SDValue Lane : Op->ops()) {
9441 // For integer vectors, type legalization would have promoted the
9442 // operands already. Otherwise, if Op is a floating-point splat
9443 // (with operands cast to integers), then the only possibilities
9444 // are constants and UNDEFs.
9445 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
// Truncate the constant to the element width, then rematerialize as i32.
9446 APInt LowBits(EltTy.getSizeInBits(),
9447 CstLane->getZExtValue());
9448 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
9449 } else if (Lane.getNode()->isUndef()) {
9450 Lane = DAG.getUNDEF(MVT::i32);
9451 } else {
9452 assert(Lane.getValueType() == MVT::i32 &&
9453 "Unexpected BUILD_VECTOR operand type");
9454 }
9455 Ops.push_back(Lane);
9456 }
9457 return DAG.getBuildVector(VT, dl, Ops);
9458}
9459
// Try to materialize a constant BUILD_VECTOR with a single AdvSIMD
// modified-immediate instruction, trying MOVI forms on the defined bits and
// then MVNI forms on the inverted bits. Returns SDValue() if no immediate
// encoding applies. NOTE(review): this listing elides the signature, the
// resolveBuildVector call, and the tryAdvSIMDModImm* call chains — only the
// skeleton of the function is visible here.
9461 EVT VT = Op.getValueType();
9462
9463 APInt DefBits(VT.getSizeInBits(), 0);
9464 APInt UndefBits(VT.getSizeInBits(), 0);
9467 SDValue NewOp;
9474 return NewOp;
9475
// Retry with complemented bits: values unencodable as MOVI may encode as
// MVNI (move-inverted-immediate).
9476 DefBits = ~DefBits;
9480 return NewOp;
9481
9489 return NewOp;
9490
9495 return NewOp;
9496 }
9497
9498 return SDValue();
9499}
9500
// Lower ISD::BUILD_VECTOR. Strategy, in order: keep all-zero/all-one splats
// for pattern matching; try a single modified-immediate instruction; then
// scan the lanes to pick among DUP, UZP1/UZP2 (deinterleave), splat+inserts,
// constant-pool expansion, ReconstructShuffle, or a final INSERT_VECTOR_ELT
// sequence. NOTE(review): this listing elides several lines (the BVN cast,
// the isOnlyLowElement guard condition, parts of the lane-scan bookkeeping,
// and the PreferDUPAndInsert threshold) — consult full source before editing.
9501 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
9502 SelectionDAG &DAG) const {
9503 EVT VT = Op.getValueType();
9504
9505 // Try to build a simple constant vector.
9506 Op = NormalizeBuildVector(Op, DAG);
9507 if (VT.isInteger()) {
9508 // Certain vector constants, used to express things like logical NOT and
9509 // arithmetic NEG, are passed through unmodified. This allows special
9510 // patterns for these operations to match, which will lower these constants
9511 // to whatever is proven necessary.
9513 if (BVN->isConstant())
9514 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
9515 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
9516 APInt Val(BitSize,
9517 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
9518 if (Val.isNullValue() || Val.isAllOnesValue())
9519 return Op;
9520 }
9521 }
9522
9523 if (SDValue V = ConstantBuildVector(Op, DAG))
9524 return V;
9525
9526 // Scan through the operands to find some interesting properties we can
9527 // exploit:
9528 // 1) If only one value is used, we can use a DUP, or
9529 // 2) if only the low element is not undef, we can just insert that, or
9530 // 3) if only one constant value is used (w/ some non-constant lanes),
9531 // we can splat the constant value into the whole vector then fill
9532 // in the non-constant lanes.
9533 // 4) FIXME: If different constant values are used, but we can intelligently
9534 // select the values we'll be overwriting for the non-constant
9535 // lanes such that we can directly materialize the vector
9536 // some other way (MOVI, e.g.), we can be sneaky.
9537 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
9538 SDLoc dl(Op);
9539 unsigned NumElts = VT.getVectorNumElements();
9540 bool isOnlyLowElement = true;
9541 bool usesOnlyOneValue = true;
9542 bool usesOnlyOneConstantValue = true;
9543 bool isConstant = true;
9544 bool AllLanesExtractElt = true;
9545 unsigned NumConstantLanes = 0;
9546 unsigned NumDifferentLanes = 0;
9547 unsigned NumUndefLanes = 0;
9548 SDValue Value;
9549 SDValue ConstantValue;
// One pass over the lanes gathers all the flags/counters used below.
9550 for (unsigned i = 0; i < NumElts; ++i) {
9551 SDValue V = Op.getOperand(i);
9552 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9553 AllLanesExtractElt = false;
9554 if (V.isUndef()) {
9555 ++NumUndefLanes;
9556 continue;
9557 }
9558 if (i > 0)
9559 isOnlyLowElement = false;
9561 isConstant = false;
9562
// Track whether all constant lanes share a single value.
9565 if (!ConstantValue.getNode())
9566 ConstantValue = V;
9567 else if (ConstantValue != V)
9569 }
9570
9571 if (!Value.getNode())
9572 Value = V;
9573 else if (V != Value) {
9574 usesOnlyOneValue = false;
9576 }
9577 }
9578
9579 if (!Value.getNode()) {
9580 LLVM_DEBUG(
9581 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
9582 return DAG.getUNDEF(VT);
9583 }
9584
9585 // Convert BUILD_VECTOR where all elements but the lowest are undef into
9586 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
9587 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
9589 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
9590 "SCALAR_TO_VECTOR node\n");
9591 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
9592 }
9593
// If every lane extracts from one source vector at strictly even or strictly
// odd indices, the whole BUILD_VECTOR is a deinterleave: UZP1/UZP2.
9594 if (AllLanesExtractElt) {
9595 SDNode *Vector = nullptr;
9596 bool Even = false;
9597 bool Odd = false;
9598 // Check whether the extract elements match the Even pattern <0,2,4,...> or
9599 // the Odd pattern <1,3,5,...>.
9600 for (unsigned i = 0; i < NumElts; ++i) {
9601 SDValue V = Op.getOperand(i);
9602 const SDNode *N = V.getNode();
9603 if (!isa<ConstantSDNode>(N->getOperand(1)))
9604 break;
9605 SDValue N0 = N->getOperand(0);
9606
9607 // All elements are extracted from the same vector.
9608 if (!Vector) {
9609 Vector = N0.getNode();
9610 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
9611 // BUILD_VECTOR.
9612 if (VT.getVectorElementType() !=
9614 break;
9615 } else if (Vector != N0.getNode()) {
9616 Odd = false;
9617 Even = false;
9618 break;
9619 }
9620
9621 // Extracted values are either at Even indices <0,2,4,...> or at Odd
9622 // indices <1,3,5,...>.
9623 uint64_t Val = N->getConstantOperandVal(1);
9624 if (Val == 2 * i) {
9625 Even = true;
9626 continue;
9627 }
9628 if (Val - 1 == 2 * i) {
9629 Odd = true;
9630 continue;
9631 }
9632
9633 // Something does not match: abort.
9634 Odd = false;
9635 Even = false;
9636 break;
9637 }
9638 if (Even || Odd) {
// Split the (wider) source into low/high halves and deinterleave.
9639 SDValue LHS =
9641 DAG.getConstant(0, dl, MVT::i64));
9642 SDValue RHS =
9644 DAG.getConstant(NumElts, dl, MVT::i64));
9645
9646 if (Even && !Odd)
9647 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
9648 RHS);
9649 if (Odd && !Even)
9650 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
9651 RHS);
9652 }
9653 }
9654
9655 // Use DUP for non-constant splats. For f32 constant splats, reduce to
9656 // i32 and try again.
9657 if (usesOnlyOneValue) {
9658 if (!isConstant) {
9659 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9660 Value.getValueType() != VT) {
9661 LLVM_DEBUG(
9662 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
9663 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
9664 }
9665
9666 // This is actually a DUPLANExx operation, which keeps everything vectory.
9667
9668 SDValue Lane = Value.getOperand(1);
9669 Value = Value.getOperand(0);
9670 if (Value.getValueSizeInBits() == 64) {
9671 LLVM_DEBUG(
9672 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
9673 "widening it\n");
9674 Value = WidenVector(Value, DAG);
9675 }
9676
9677 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
9678 return DAG.getNode(Opcode, dl, VT, Value, Lane);
9679 }
9680
// Constant FP splat: bitcast lanes to same-width integers and re-lower,
// so the integer immediate paths above get a chance.
9683 EVT EltTy = VT.getVectorElementType();
9684 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
9685 EltTy == MVT::f64) && "Unsupported floating-point vector type");
9686 LLVM_DEBUG(
9687 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
9688 "BITCASTS, and try again\n");
9689 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
9690 for (unsigned i = 0; i < NumElts; ++i)
9691 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
9692 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
9693 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
9694 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
9695 Val.dump(););
9696 Val = LowerBUILD_VECTOR(Val, DAG);
9697 if (Val.getNode())
9698 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
9699 }
9700 }
9701
9702 // If we need to insert a small number of different non-constant elements and
9703 // the vector width is sufficiently large, prefer using DUP with the common
9704 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
9705 // skip the constant lane handling below.
9706 bool PreferDUPAndInsert =
9707 !isConstant && NumDifferentLanes >= 1 &&
9710
9711 // If there was only one constant value used and for more than one lane,
9712 // start by splatting that value, then replace the non-constant lanes. This
9713 // is better than the default, which will perform a separate initialization
9714 // for each lane.
9716 // Firstly, try to materialize the splat constant.
9717 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
9718 Val = ConstantBuildVector(Vec, DAG);
9719 if (!Val) {
9720 // Otherwise, materialize the constant and splat it.
9721 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
9722 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
9723 }
9724
9725 // Now insert the non-constant lanes.
9726 for (unsigned i = 0; i < NumElts; ++i) {
9727 SDValue V = Op.getOperand(i);
9728 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
9730 // Note that type legalization likely mucked about with the VT of the
9731 // source operand, so we may have to convert it here before inserting.
9732 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
9733 }
9734 return Val;
9735 }
9736
9737 // This will generate a load from the constant pool.
9738 if (isConstant) {
9739 LLVM_DEBUG(
9740 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
9741 "expansion\n");
9742 return SDValue();
9743 }
9744
9745 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
9746 if (NumElts >= 4) {
9747 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
9748 return shuffle;
9749 }
9750
9751 if (PreferDUPAndInsert) {
9752 // First, build a constant vector with the common element.
9754 for (unsigned I = 0; I < NumElts; ++I)
9755 Ops.push_back(Value);
9756 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
9757 // Next, insert the elements that do not match the common value.
9758 for (unsigned I = 0; I < NumElts; ++I)
9759 if (Op.getOperand(I) != Value)
9760 NewVector =
9762 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
9763
9764 return NewVector;
9765 }
9766
9767 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
9768 // know the default expansion would otherwise fall back on something even
9769 // worse. For a vector with one or two non-undef values, that's
9770 // scalar_to_vector for the elements followed by a shuffle (provided the
9771 // shuffle is valid for the target) and materialization element by element
9772 // on the stack followed by a load for everything else.
9773 if (!isConstant && !usesOnlyOneValue) {
9774 LLVM_DEBUG(
9775 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
9776 "of INSERT_VECTOR_ELT\n");
9777
9778 SDValue Vec = DAG.getUNDEF(VT);
9779 SDValue Op0 = Op.getOperand(0);
9780 unsigned i = 0;
9781
9782 // Use SCALAR_TO_VECTOR for lane zero to
9783 // a) Avoid a RMW dependency on the full vector register, and
9784 // b) Allow the register coalescer to fold away the copy if the
9785 // value is already in an S or D register, and we're forced to emit an
9786 // INSERT_SUBREG that we can't fold anywhere.
9787 //
9788 // We also allow types like i8 and i16 which are illegal scalar but legal
9789 // vector element types. After type-legalization the inserted value is
9790 // extended (i32) and it is safe to cast them to the vector type by ignoring
9791 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
9792 if (!Op0.isUndef()) {
9793 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
9794 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
9795 ++i;
9796 }
9797 LLVM_DEBUG(if (i < NumElts) dbgs()
9798 << "Creating nodes for the other vector elements:\n";);
9799 for (; i < NumElts; ++i) {
9800 SDValue V = Op.getOperand(i);
9801 if (V.isUndef())
9802 continue;
9803 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
9804 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
9805 }
9806 return Vec;
9807 }
9808
9809 LLVM_DEBUG(
9810 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
9811 "better alternative\n");
9812 return SDValue();
9813}
9814
9815SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
9816 SelectionDAG &DAG) const {
9817 assert(Op.getValueType().isScalableVector() &&
9818 isTypeLegal(Op.getValueType()) &&
9819 "Expected legal scalable vector type!");
9820
9821 if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
9822 return Op;
9823
9824 return SDValue();
9825}
9826
// Lower ISD::INSERT_VECTOR_ELT. Requires a constant in-range lane index.
// 128-bit NEON types are legal as-is; 64-bit types are widened to 128 bits,
// inserted, and narrowed back. NOTE(review): the line building the
// INSERT_VECTOR_ELT node into Node is elided in this listing.
9827 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
9828 SelectionDAG &DAG) const {
9829 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
9830
9831 // Check for non-constant or out of range lane.
9832 EVT VT = Op.getOperand(0).getValueType();
9833 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
9834 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
9835 return SDValue();
9836
9837
9838 // Insertion/extraction are legal for V128 types.
9839 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
9840 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
9841 VT == MVT::v8f16 || VT == MVT::v8bf16)
9842 return Op;
9843
// Anything that is neither a V128 nor a V64 NEON type is not handled here.
9844 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
9845 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
9846 VT != MVT::v4bf16)
9847 return SDValue();
9848
9849 // For V64 types, we perform insertion by expanding the value
9850 // to a V128 type and perform the insertion on that.
9851 SDLoc DL(Op);
9852 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
9853 EVT WideTy = WideVec.getValueType();
9854
9856 Op.getOperand(1), Op.getOperand(2));
9857 // Re-narrow the resultant vector.
9858 return NarrowVector(Node, DAG);
9859}
9860
// Lower ISD::EXTRACT_VECTOR_ELT. Mirrors LowerINSERT_VECTOR_ELT: constant
// in-range lane required, V128 types are legal as-is, and V64 types are
// widened to V128 before extracting. Sub-i32 integer results are extracted
// as i32. NOTE(review): the line building the EXTRACT_VECTOR_ELT node for
// the return is elided in this listing.
9861 SDValue
9862 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
9863 SelectionDAG &DAG) const {
9864 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
9865
9866 // Check for non-constant or out of range lane.
9867 EVT VT = Op.getOperand(0).getValueType();
9868 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9869 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
9870 return SDValue();
9871
9872
9873 // Insertion/extraction are legal for V128 types.
9874 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
9875 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
9876 VT == MVT::v8f16 || VT == MVT::v8bf16)
9877 return Op;
9878
9879 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
9880 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
9881 VT != MVT::v4bf16)
9882 return SDValue();
9883
9884 // For V64 types, we perform extraction by expanding the value
9885 // to a V128 type and perform the extraction on that.
9886 SDLoc DL(Op);
9887 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
9888 EVT WideTy = WideVec.getValueType();
9889
// i8/i16 elements are not legal scalars; extract them as i32.
9890 EVT ExtrTy = WideTy.getVectorElementType();
9891 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
9892 ExtrTy = MVT::i32;
9893
9894 // For extractions, we just return the result directly.
9896 Op.getOperand(1));
9897}
9898
9899SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
9900 SelectionDAG &DAG) const {
9901 assert(Op.getValueType().isFixedLengthVector() &&
9902 "Only cases that extract a fixed length vector are supported!");
9903
9904 EVT InVT = Op.getOperand(0).getValueType();
9905 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
9906 unsigned Size = Op.getValueSizeInBits();
9907
9908 if (InVT.isScalableVector()) {
9909 // This will be matched by custom code during ISelDAGToDAG.
9910 if (Idx == 0 && isPackedVectorType(InVT, DAG))
9911 return Op;
9912
9913 return SDValue();
9914 }
9915
9916 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
9917 if (Idx == 0 && InVT.getSizeInBits() <= 128)
9918 return Op;
9919
9920 // If this is extracting the upper 64-bits of a 128-bit vector, we match
9921 // that directly.
9922 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
9923 InVT.getSizeInBits() == 128)
9924 return Op;
9925
9926 return SDValue();
9927}
9928
9929SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
9930 SelectionDAG &DAG) const {
9931 assert(Op.getValueType().isScalableVector() &&
9932 "Only expect to lower inserts into scalable vectors!");
9933
9934 EVT InVT = Op.getOperand(1).getValueType();
9935 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
9936
9937 if (InVT.isScalableVector()) {
9938 SDLoc DL(Op);
9939 EVT VT = Op.getValueType();
9940
9941 if (!isTypeLegal(VT) || !VT.isInteger())
9942 return SDValue();
9943
9944 SDValue Vec0 = Op.getOperand(0);
9945 SDValue Vec1 = Op.getOperand(1);
9946
9947 // Ensure the subvector is half the size of the main vector.
9948 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
9949 return SDValue();
9950
9951 // Extend elements of smaller vector...
9952 EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
9954
9955 if (Idx == 0) {
9957 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
9958 } else if (Idx == InVT.getVectorMinNumElements()) {
9960 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
9961 }
9962
9963 return SDValue();
9964 }
9965
9966 // This will be matched by custom code during ISelDAGToDAG.
9967 if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
9968 return Op;
9969
9970 return SDValue();
9971}
9972
9973SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
9974 EVT VT = Op.getValueType();
9975
9976 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
9977 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
9978
9979 assert(VT.isScalableVector() && "Expected a scalable vector.");
9980
9981 bool Signed = Op.getOpcode() == ISD::SDIV;
9983
9984 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
9985 return LowerToPredicatedOp(Op, DAG, PredOpcode);
9986
9987 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
9988 // operations, and truncate the result.
9989 EVT WidenedVT;
9990 if (VT == MVT::nxv16i8)
9992 else if (VT == MVT::nxv8i16)
9994 else
9995 llvm_unreachable("Unexpected Custom DIV operation");
9996
9997 SDLoc dl(Op);
10000 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
10001 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
10002 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
10003 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
10004 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
10005 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
10006 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
10007}
10008
10010 // Currently no fixed length shuffles that require SVE are legal.
10011 if (useSVEForFixedLengthVectorVT(VT))
10012 return false;
10013
10014 if (VT.getVectorNumElements() == 4 &&
10015 (VT.is128BitVector() || VT.is64BitVector())) {
10016 unsigned PFIndexes[4];
10017 for (unsigned i = 0; i != 4; ++i) {
10018 if (M[i] < 0)
10019 PFIndexes[i] = 8;
10020 else
10021 PFIndexes[i] = M[i];
10022 }
10023
10024 // Compute the index in the perfect shuffle table.
10025 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10026 PFIndexes[2] * 9 + PFIndexes[3];
10028 unsigned Cost = (PFEntry >> 30);
10029
10030 if (Cost <= 4)
10031 return true;
10032 }
10033
10034 bool DummyBool;
10035 int DummyInt;
10036 unsigned DummyUnsigned;
10037
10038 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
10039 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
10041 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
10042 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
10043 isZIPMask(M, VT, DummyUnsigned) ||
10048 isConcatMask(M, VT, VT.getSizeInBits() == 128));
10049}
10050
10051/// getVShiftImm - Check if this is a valid build_vector for the immediate
10052/// operand of a vector shift operation, where all the elements of the
10053/// build_vector must have the same constant integer value.
10054static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10055 // Ignore bit_converts.
10056 while (Op.getOpcode() == ISD::BITCAST)
10057 Op = Op.getOperand(0);
10059 APInt SplatBits, SplatUndef;
10060 unsigned SplatBitSize;
10061 bool HasAnyUndefs;
10062 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10064 SplatBitSize > ElementBits)
10065 return false;
10066 Cnt = SplatBits.getSExtValue();
10067 return true;
10068}
10069
10070/// isVShiftLImm - Check if this is a valid build_vector for the immediate
10071/// operand of a vector shift left operation. That value must be in the range:
10072/// 0 <= Value < ElementBits for a left shift; or
10073/// 0 <= Value <= ElementBits for a long left shift.
10074static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10075 assert(VT.isVector() && "vector shift count is not a vector type");
10076 int64_t ElementBits = VT.getScalarSizeInBits();
10077 if (!getVShiftImm(Op, ElementBits, Cnt))
10078 return false;
10079 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
10080}
10081
10082/// isVShiftRImm - Check if this is a valid build_vector for the immediate
10083/// operand of a vector shift right operation. The value must be in the range:
10084/// 1 <= Value <= ElementBits for a right shift; or
10085static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
10086 assert(VT.isVector() && "vector shift count is not a vector type");
10087 int64_t ElementBits = VT.getScalarSizeInBits();
10088 if (!getVShiftImm(Op, ElementBits, Cnt))
10089 return false;
10090 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
10091}
10092
10093SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
10094 SelectionDAG &DAG) const {
10095 EVT VT = Op.getValueType();
10096
10097 if (VT.getScalarType() == MVT::i1) {
10098 // Lower i1 truncate to `(x & 1) != 0`.
10099 SDLoc dl(Op);
10100 EVT OpVT = Op.getOperand(0).getValueType();
10101 SDValue Zero = DAG.getConstant(0, dl, OpVT);
10102 SDValue One = DAG.getConstant(1, dl, OpVT);
10103 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
10104 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
10105 }
10106
10107 if (!VT.isVector() || VT.isScalableVector())
10108 return SDValue();
10109
10110 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10111 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
10112
10113 return SDValue();
10114}
10115
10116SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
10117 SelectionDAG &DAG) const {
10118 EVT VT = Op.getValueType();
10119 SDLoc DL(Op);
10120 int64_t Cnt;
10121
10122 if (!Op.getOperand(1).getValueType().isVector())
10123 return Op;
10124 unsigned EltSize = VT.getScalarSizeInBits();
10125
10126 switch (Op.getOpcode()) {
10127 default:
10128 llvm_unreachable("unexpected shift opcode");
10129
10130 case ISD::SHL:
10131 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
10132 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
10133
10134 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
10135 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
10136 DAG.getConstant(Cnt, DL, MVT::i32));
10137 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10138 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
10139 MVT::i32),
10140 Op.getOperand(0), Op.getOperand(1));
10141 case ISD::SRA:
10142 case ISD::SRL:
10143 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
10144 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
10146 return LowerToPredicatedOp(Op, DAG, Opc);
10147 }
10148
10149 // Right shift immediate
10150 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
10151 unsigned Opc =
10152 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
10153 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
10154 DAG.getConstant(Cnt, DL, MVT::i32));
10155 }
10156
10157 // Right shift register. Note, there is not a shift right register
10158 // instruction, but the shift left register instruction takes a signed
10159 // value, where negative numbers specify a right shift.
10160 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
10162 // negate the shift amount
10163 SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
10166 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
10167 NegShift);
10168 return NegShiftLeft;
10169 }
10170
10171 return SDValue();
10172}
10173
10175 AArch64CC::CondCode CC, bool NoNans, EVT VT,
10176 const SDLoc &dl, SelectionDAG &DAG) {
10177 EVT SrcVT = LHS.getValueType();
10178 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10179 "function only supposed to emit natural comparisons");
10180
10182 APInt CnstBits(VT.getSizeInBits(), 0);
10183 APInt UndefBits(VT.getSizeInBits(), 0);
10185 bool IsZero = IsCnst && (CnstBits == 0);
10186
10187 if (SrcVT.getVectorElementType().isFloatingPoint()) {
10188 switch (CC) {
10189 default:
10190 return SDValue();
10191 case AArch64CC::NE: {
10192 SDValue Fcmeq;
10193 if (IsZero)
10194 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10195 else
10196 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10197 return DAG.getNOT(dl, Fcmeq, VT);
10198 }
10199 case AArch64CC::EQ:
10200 if (IsZero)
10201 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10202 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10203 case AArch64CC::GE:
10204 if (IsZero)
10205 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
10206 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10207 case AArch64CC::GT:
10208 if (IsZero)
10209 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
10210 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10211 case AArch64CC::LS:
10212 if (IsZero)
10213 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
10214 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10215 case AArch64CC::LT:
10216 if (!NoNans)
10217 return SDValue();
10218 // If we ignore NaNs then we can use to the MI implementation.
10220 case AArch64CC::MI:
10221 if (IsZero)
10222 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
10223 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10224 }
10225 }
10226
10227 switch (CC) {
10228 default:
10229 return SDValue();
10230 case AArch64CC::NE: {
10231 SDValue Cmeq;
10232 if (IsZero)
10233 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10234 else
10235 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10236 return DAG.getNOT(dl, Cmeq, VT);
10237 }
10238 case AArch64CC::EQ:
10239 if (IsZero)
10240 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10241 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10242 case AArch64CC::GE:
10243 if (IsZero)
10244 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
10245 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
10246 case AArch64CC::GT:
10247 if (IsZero)
10248 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
10249 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
10250 case AArch64CC::LE:
10251 if (IsZero)
10252 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
10253 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
10254 case AArch64CC::LS:
10255 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
10256 case AArch64CC::LO:
10257 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
10258 case AArch64CC::LT:
10259 if (IsZero)
10260 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
10261 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
10262 case AArch64CC::HI:
10263 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
10264 case AArch64CC::HS:
10265 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
10266 }
10267}
10268
10269SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
10270 SelectionDAG &DAG) const {
10271 if (Op.getValueType().isScalableVector()) {
10272 if (Op.getOperand(0).getValueType().isFloatingPoint())
10273 return Op;
10274 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
10275 }
10276
10277 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10278 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
10279
10280 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10281 SDValue LHS = Op.getOperand(0);
10282 SDValue RHS = Op.getOperand(1);
10283 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
10284 SDLoc dl(Op);
10285
10286 if (LHS.getValueType().getVectorElementType().isInteger()) {
10287 assert(LHS.getValueType() == RHS.getValueType());
10289 SDValue Cmp =
10290 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
10291 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10292 }
10293
10294 const bool FullFP16 =
10295 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
10296
10297 // Make v4f16 (only) fcmp operations utilise vector instructions
10298 // v8f16 support will be a litle more complicated
10299 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
10300 if (LHS.getValueType().getVectorNumElements() == 4) {
10301 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
10302 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
10303 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
10305 CmpVT = MVT::v4i32;
10306 } else
10307 return SDValue();
10308 }
10309
10310 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
10311 LHS.getValueType().getVectorElementType() != MVT::f128);
10312
10313 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10314 // clean. Some of them require two branches to implement.
10316 bool ShouldInvert;
10318
10320 SDValue Cmp =
10321 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
10322 if (!Cmp.getNode())
10323 return SDValue();
10324
10325 if (CC2 != AArch64CC::AL) {
10326 SDValue Cmp2 =
10327 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
10328 if (!Cmp2.getNode())
10329 return SDValue();
10330
10331 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
10332 }
10333
10334 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10335
10336 if (ShouldInvert)
10337 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
10338
10339 return Cmp;
10340}
10341
10343 SelectionDAG &DAG) {
10344 SDValue VecOp = ScalarOp.getOperand(0);
10345 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
10346 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
10347 DAG.getConstant(0, DL, MVT::i64));
10348}
10349
10350SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
10351 SelectionDAG &DAG) const {
10352 SDValue Src = Op.getOperand(0);
10353
10354 // Try to lower fixed length reductions to SVE.
10355 EVT SrcVT = Src.getValueType();
10356 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
10357 Op.getOpcode() == ISD::VECREDUCE_OR ||
10358 Op.getOpcode() == ISD::VECREDUCE_XOR ||
10359 Op.getOpcode() == ISD::VECREDUCE_FADD ||
10360 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
10361 SrcVT.getVectorElementType() == MVT::i64);
10362 if (SrcVT.isScalableVector() ||
10363 useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
10364
10365 if (SrcVT.getVectorElementType() == MVT::i1)
10366 return LowerPredReductionToSVE(Op, DAG);
10367
10368 switch (Op.getOpcode()) {
10369 case ISD::VECREDUCE_ADD:
10370 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
10371 case ISD::VECREDUCE_AND:
10372 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
10373 case ISD::VECREDUCE_OR:
10374 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
10376 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
10378 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
10380 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
10382 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
10383 case ISD::VECREDUCE_XOR:
10384 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
10386 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
10388 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
10390 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
10391 default:
10392 llvm_unreachable("Unhandled fixed length reduction");
10393 }
10394 }
10395
10396 // Lower NEON reductions.
10397 SDLoc dl(Op);
10398 switch (Op.getOpcode()) {
10399 case ISD::VECREDUCE_ADD:
10400 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
10402 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
10404 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
10406 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
10408 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
10409 case ISD::VECREDUCE_FMAX: {
10410 return DAG.getNode(
10411 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
10412 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
10413 Src);
10414 }
10415 case ISD::VECREDUCE_FMIN: {
10416 return DAG.getNode(
10417 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
10418 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
10419 Src);
10420 }
10421 default:
10422 llvm_unreachable("Unhandled reduction");
10423 }
10424}
10425
10426SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
10427 SelectionDAG &DAG) const {
10428 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
10429 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
10430 return SDValue();
10431
10432 // LSE has an atomic load-add instruction, but not a load-sub.
10433 SDLoc dl(Op);
10434 MVT VT = Op.getSimpleValueType();
10435 SDValue RHS = Op.getOperand(2);
10436 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
10437 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
10438 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
10439 Op.getOperand(0), Op.getOperand(1), RHS,
10440 AN->getMemOperand());
10441}
10442
10443SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
10444 SelectionDAG &DAG) const {
10445 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
10446 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
10447 return SDValue();
10448
10449 // LSE has an atomic load-clear instruction, but not a load-and.
10450 SDLoc dl(Op);
10451 MVT VT = Op.getSimpleValueType();
10452 SDValue RHS = Op.getOperand(2);
10453 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
10454 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
10455 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
10456 Op.getOperand(0), Op.getOperand(1), RHS,
10457 AN->getMemOperand());
10458}
10459
10460SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
10461 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
10462 SDLoc dl(Op);
10464 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
10465
10466 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10467 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
10468 if (Subtarget->hasCustomCallingConv())
10469 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10470
10471 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
10472 DAG.getConstant(4, dl, MVT::i64));
10473 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
10474 Chain =
10476 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
10477 DAG.getRegisterMask(Mask), Chain.getValue(1));
10478 // To match the actual intent better, we should read the output from X15 here
10479 // again (instead of potentially spilling it to the stack), but rereading Size
10480 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
10481 // here.
10482
10483 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
10484 DAG.getConstant(4, dl, MVT::i64));
10485 return Chain;
10486}
10487
10488SDValue
10489AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10490 SelectionDAG &DAG) const {
10491 assert(Subtarget->isTargetWindows() &&
10492 "Only Windows alloca probing supported");
10493 SDLoc dl(Op);
10494 // Get the inputs.
10495 SDNode *Node = Op.getNode();
10496 SDValue Chain = Op.getOperand(0);
10497 SDValue Size = Op.getOperand(1);
10499 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
10500 EVT VT = Node->getValueType(0);
10501
10503 "no-stack-arg-probe")) {
10504 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
10505 Chain = SP.getValue(1);
10506 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
10507 if (Align)
10508 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10509 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
10510 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
10511 SDValue Ops[2] = {SP, Chain};
10512 return DAG.getMergeValues(Ops, dl);
10513 }
10514
10515 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
10516
10517 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
10518
10519 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
10520 Chain = SP.getValue(1);
10521 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
10522 if (Align)
10523 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10524 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
10525 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
10526
10527 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
10528 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
10529
10530 SDValue Ops[2] = {SP, Chain};
10531 return DAG.getMergeValues(Ops, dl);
10532}
10533
10534SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
10535 SelectionDAG &DAG) const {
10536 EVT VT = Op.getValueType();
10537 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
10538
10539 SDLoc DL(Op);
10540 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
10541 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
10542 DL, VT);
10543}
10544
10545/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
10546template <unsigned NumVecs>
10547static bool
10550 Info.opc = ISD::INTRINSIC_VOID;
10551 // Retrieve EC from first vector argument.
10552 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
10554#ifndef NDEBUG
10555 // Check the assumption that all input vectors are the same type.
10556 for (unsigned I = 0; I < NumVecs; ++I)
10557 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
10558 "Invalid type.");
10559#endif
10560 // memVT is `NumVecs * VT`.
10561 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
10562 EC * NumVecs);
10563 Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
10564 Info.offset = 0;
10565 Info.align.reset();
10566 Info.flags = MachineMemOperand::MOStore;
10567 return true;
10568}
10569
10570/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
10571/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
10572/// specified in the intrinsic calls.
10574 const CallInst &I,
10575 MachineFunction &MF,
10576 unsigned Intrinsic) const {
10577 auto &DL = I.getModule()->getDataLayout();
10578 switch (Intrinsic) {
10579 case Intrinsic::aarch64_sve_st2:
10580 return setInfoSVEStN<2>(*this, DL, Info, I);
10581 case Intrinsic::aarch64_sve_st3:
10582 return setInfoSVEStN<3>(*this, DL, Info, I);
10583 case Intrinsic::aarch64_sve_st4:
10584 return setInfoSVEStN<4>(*this, DL, Info, I);
10585 case Intrinsic::aarch64_neon_ld2:
10586 case Intrinsic::aarch64_neon_ld3:
10587 case Intrinsic::aarch64_neon_ld4:
10588 case Intrinsic::aarch64_neon_ld1x2:
10589 case Intrinsic::aarch64_neon_ld1x3:
10590 case Intrinsic::aarch64_neon_ld1x4:
10591 case Intrinsic::aarch64_neon_ld2lane:
10592 case Intrinsic::aarch64_neon_ld3lane:
10593 case Intrinsic::aarch64_neon_ld4lane:
10594 case Intrinsic::aarch64_neon_ld2r:
10595 case Intrinsic::aarch64_neon_ld3r:
10596 case Intrinsic::aarch64_neon_ld4r: {
10597 Info.opc = ISD::INTRINSIC_W_CHAIN;
10598 // Conservatively set memVT to the entire set of vectors loaded.
10599 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
10600 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10601 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
10602 Info.offset = 0;
10603 Info.align.reset();
10604 // volatile loads with NEON intrinsics not supported
10605 Info.flags = MachineMemOperand::MOLoad;
10606 return true;
10607 }
10608 case Intrinsic::aarch64_neon_st2:
10609 case Intrinsic::aarch64_neon_st3:
10610 case Intrinsic::aarch64_neon_st4:
10611 case Intrinsic::aarch64_neon_st1x2:
10612 case Intrinsic::aarch64_neon_st1x3:
10613 case Intrinsic::aarch64_neon_st1x4:
10614 case Intrinsic::aarch64_neon_st2lane:
10615 case Intrinsic::aarch64_neon_st3lane:
10616 case Intrinsic::aarch64_neon_st4lane: {
10617 Info.opc = ISD::INTRINSIC_VOID;
10618 // Conservatively set memVT to the entire set of vectors stored.
10619 unsigned NumElts = 0;
10620 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
10621 Type *ArgTy = I.getArgOperand(ArgI)->getType();
10622 if (!ArgTy->isVectorTy())
10623 break;
10624 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
10625 }
10626 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10627 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
10628 Info.offset = 0;
10629 Info.align.reset();
10630 // volatile stores with NEON intrinsics not supported
10631 Info.flags = MachineMemOperand::MOStore;
10632 return true;
10633 }
10634 case Intrinsic::aarch64_ldaxr:
10635 case Intrinsic::aarch64_ldxr: {
10636 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
10637 Info.opc = ISD::INTRINSIC_W_CHAIN;
10638 Info.memVT = MVT::getVT(PtrTy->getElementType());
10639 Info.ptrVal = I.getArgOperand(0);
10640 Info.offset = 0;
10641 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10643 return true;
10644 }
10645 case Intrinsic::aarch64_stlxr:
10646 case Intrinsic::aarch64_stxr: {
10647 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
10648 Info.opc = ISD::INTRINSIC_W_CHAIN;
10649 Info.memVT = MVT::getVT(PtrTy->getElementType());
10650 Info.ptrVal = I.getArgOperand(1);
10651 Info.offset = 0;
10652 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10654 return true;
10655 }
10656 case Intrinsic::aarch64_ldaxp:
10657 case Intrinsic::aarch64_ldxp:
10658 Info.opc = ISD::INTRINSIC_W_CHAIN;
10659 Info.memVT = MVT::i128;
10660 Info.ptrVal = I.getArgOperand(0);
10661 Info.offset = 0;
10662 Info.align = Align(16);
10664 return true;
10665 case Intrinsic::aarch64_stlxp:
10666 case Intrinsic::aarch64_stxp:
10667 Info.opc = ISD::INTRINSIC_W_CHAIN;
10668 Info.memVT = MVT::i128;
10669 Info.ptrVal = I.getArgOperand(2);
10670 Info.offset = 0;
10671 Info.align = Align(16);
10673 return true;
10674 case Intrinsic::aarch64_sve_ldnt1: {
10675 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
10676 Info.opc = ISD::INTRINSIC_W_CHAIN;
10677 Info.memVT = MVT::getVT(I.getType());
10678 Info.ptrVal = I.getArgOperand(1);
10679 Info.offset = 0;
10680 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10681 Info.flags = MachineMemOperand::MOLoad;
10682 if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
10684 return true;
10685 }
10686 case Intrinsic::aarch64_sve_stnt1: {
10687 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
10688 Info.opc = ISD::INTRINSIC_W_CHAIN;
10689 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
10690 Info.ptrVal = I.getArgOperand(2);
10691 Info.offset = 0;
10692 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10693 Info.flags = MachineMemOperand::MOStore;
10694 if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
10696 return true;
10697 }
10698 default:
10699 break;
10700 }
10701
10702 return false;
10703}
10704
10706 ISD::LoadExtType ExtTy,
10707 EVT NewVT) const {
10708 // TODO: This may be worth removing. Check regression tests for diffs.
10710 return false;
10711
10712 // If we're reducing the load width in order to avoid having to use an extra
10713 // instruction to do extension then it's probably a good idea.
10714 if (ExtTy != ISD::NON_EXTLOAD)
10715 return true;
10716 // Don't reduce load width if it would prevent us from combining a shift into
10717 // the offset.
10718 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
10719 assert(Mem);
10720 const SDValue &Base = Mem->getBasePtr();
10721 if (Base.getOpcode() == ISD::ADD &&
10722 Base.getOperand(1).getOpcode() == ISD::SHL &&
10723 Base.getOperand(1).hasOneUse() &&
10724 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
10725 // The shift can be combined if it matches the size of the value being
10726 // loaded (and so reducing the width would make it not match).
10727 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
10728 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
10729 if (ShiftAmount == Log2_32(LoadBytes))
10730 return false;
10731 }
10732 // We have no reason to disallow reducing the load width, so allow it.
10733 return true;
10734}
10735
10736// Truncations from 64-bit GPR to 32-bit GPR is free.
10738 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10739 return false;
10740 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
10741 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
10742 return NumBits1 > NumBits2;
10743}
10745 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
10746 return false;
10747 uint64_t NumBits1 = VT1.getFixedSizeInBits();
10748 uint64_t NumBits2 = VT2.getFixedSizeInBits();
10749 return NumBits1 > NumBits2;
10750}
10751
10752/// Check if it is profitable to hoist instruction in then/else to if.
10753/// Not profitable if I and it's user can form a FMA instruction
10754/// because we prefer FMSUB/FMADD.
// NOTE(review): the signature line (file line 10755) is missing from this
// extract; presumably bool AArch64TargetLowering::isProfitableToHoist(...).
 10756 if (I->getOpcode() != Instruction::FMul)
 10757 return true;
 10758
 10759 if (!I->hasOneUse())
 10760 return true;
 10761
// NOTE(review): the declaration of `User` (file line 10762, the single user of
// the fmul) is missing from this extract.
 10763
 10764 if (User &&
 10765 !(User->getOpcode() == Instruction::FSub ||
 10766 User->getOpcode() == Instruction::FAdd))
 10767 return true;
 10768
 10769 const TargetOptions &Options = getTargetMachine().Options;
 10770 const Function *F = I->getFunction();
 10771 const DataLayout &DL = F->getParent()->getDataLayout();
 10772 Type *Ty = User->getOperand(0)->getType();
 10773
// Not profitable to hoist when an FMA can be formed: fast-math (or contract)
// plus hardware FMA support means the fmul would fuse with the fadd/fsub.
// NOTE(review): one conjunct of this expression (file line 10775) is missing
// from this extract.
 10774 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
 10776 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
 10777 Options.UnsafeFPMath));
 10778}
10779
10780// All 32-bit GPR operations implicitly zero the high-half of the corresponding
10781// 64-bit GPR.
// Type* overload: i32 -> i64 zero-extension is free. NOTE(review): the
// signature line (file line 10782) is missing from this extract.
 10783 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
 10784 return false;
 10785 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
 10786 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
 10787 return NumBits1 == 32 && NumBits2 == 64;
 10788}
// EVT overload: zero-extending a scalar i32 to i64 is free (see comment on the
// Type* overload above). NOTE(review): the signature line (file line 10789) is
// missing from this extract.
 10790 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
 10791 return false;
 10792 unsigned NumBits1 = VT1.getSizeInBits();
 10793 unsigned NumBits2 = VT2.getSizeInBits();
 10794 return NumBits1 == 32 && NumBits2 == 64;
 10795}
10796
// SDValue overload: free if the VT pair is free, or if Val is a scalar integer
// load of at most 32 bits (such loads implicitly zero-extend on AArch64).
// NOTE(review): the signature line (file line 10797) is missing from this
// extract.
 10798 EVT VT1 = Val.getValueType();
 10799 if (isZExtFree(VT1, VT2)) {
 10800 return true;
 10801 }
 10802
 10803 if (Val.getOpcode() != ISD::LOAD)
 10804 return false;
 10805
 10806 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
 10807 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
 10808 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
 10809 VT1.getSizeInBits() <= 32);
 10810}
10811
// Returns true if the scalar integer extension Ext is free: every use can fold
// the extension into an addressing mode shift, a GEP scale, or a noop trunc,
// so the bfm instruction family absorbs it.
10812bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
 10813 if (isa<FPExtInst>(Ext))
 10814 return false;
 10815
 10816 // Vector types are not free.
 10817 if (Ext->getType()->isVectorTy())
 10818 return false;
 10819
 10820 for (const Use &U : Ext->uses()) {
 10821 // The extension is free if we can fold it with a left shift in an
 10822 // addressing mode or an arithmetic operation: add, sub, and cmp.
 10823
 10824 // Is there a shift?
 10825 const Instruction *Instr = cast<Instruction>(U.getUser());
 10826
 10827 // Is this a constant shift?
 10828 switch (Instr->getOpcode()) {
 10829 case Instruction::Shl:
 10830 if (!isa<ConstantInt>(Instr->getOperand(1)))
 10831 return false;
 10832 break;
// NOTE(review): the case label and the gep_type_iterator initialization (file
// lines 10833-10834) are missing from this extract; this arm presumably
// handles Instruction::GetElementPtr — confirm against the full source.
 10835 auto &DL = Ext->getModule()->getDataLayout();
 10836 std::advance(GTI, U.getOperandNo()-1);
 10837 Type *IdxTy = GTI.getIndexedType();
 10838 // This extension will end up with a shift because of the scaling factor.
 10839 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
 10840 // Get the shift amount based on the scaling factor:
 10841 // log2(sizeof(IdxTy)) - log2(8).
 10842 uint64_t ShiftAmt =
 10843 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
 10844 // Is the constant foldable in the shift of the addressing mode?
 10845 // I.e., shift amount is between 1 and 4 inclusive.
 10846 if (ShiftAmt == 0 || ShiftAmt > 4)
 10847 return false;
 10848 break;
 10849 }
 10850 case Instruction::Trunc:
 10851 // Check if this is a noop.
 10852 // trunc(sext ty1 to ty2) to ty1.
 10853 if (Instr->getType() == Ext->getOperand(0)->getType())
 10854 continue;
// NOTE(review): a fallthrough marker (file line 10855) is missing from this
// extract; a non-noop trunc falls through to the default (not free) case.
 10856 default:
 10857 return false;
 10858 }
 10859
 10860 // At this point we can use the bfm family, so this extension is free
 10861 // for that use.
 10862 }
 10863 return true;
 10864}
10865
10866/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
10867/// or upper half of the vector elements.
10868static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
// Helper: the half-value's type is exactly half the bit width of the full
// value's type.
 10869 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
 10870 auto *FullTy = FullV->getType();
 10871 auto *HalfTy = HalfV->getType();
 10872 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
 10873 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
 10874 };
 10875
// Helper: the full vector has exactly twice the element count of the half.
 10876 auto extractHalf = [](Value *FullV, Value *HalfV) {
 10877 auto *FullVT = cast<FixedVectorType>(FullV->getType());
 10878 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
 10879 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
 10880 };
 10881
// NOTE(review): the mask declarations (file line 10882) are incomplete in this
// extract; presumably ArrayRef<int> M1, M2 captured by the m_Mask matchers.
 10883 Value *S1Op1, *S2Op1;
 10884 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
// NOTE(review): the matching of Op2 (file line 10885) is missing from this
// extract.
 10886 return false;
 10887
 10888 // Check that the operands are half as wide as the result and we extract
 10889 // half of the elements of the input vectors.
 10890 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
 10891 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
 10892 return false;
 10893
 10894 // Check the mask extracts either the lower or upper half of vector
 10895 // elements.
 10896 int M1Start = -1;
 10897 int M2Start = -1;
 10898 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
// NOTE(review): the calls computing M1Start/M2Start from the shuffle masks
// (file lines 10899-10900) are missing from this extract.
 10901 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
 10902 return false;
 10903
 10904 return true;
 10905}
10906
10907/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
10908/// of the vector elements.
// NOTE(review): the signature line (file line 10909) is missing from this
// extract; presumably static bool areExtractExts(Value *Ext1, Value *Ext2).
// Helper: the extend exactly doubles the scalar bit width of its operand.
 10910 auto areExtDoubled = [](Instruction *Ext) {
 10911 return Ext->getType()->getScalarSizeInBits() ==
 10912 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
 10913 };
 10914
 10915 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
// NOTE(review): the remaining conjuncts (file lines 10916-10918: matching
// Ext2 and applying areExtDoubled to both) are missing from this extract.
 10919 return false;
 10920
 10921 return true;
 10922}
10923
10924/// Check if Op could be used with vmull_high_p64 intrinsic.
// Returns true when Op is an extractelement of lane 1 of a 2-element fixed
// vector. NOTE(review): the signature line (file line 10925) and the start of
// the match expression (file line 10928) are missing from this extract.
 10926 Value *VectorOperand = nullptr;
 10927 ConstantInt *ElementIndex = nullptr;
 10929 m_ConstantInt(ElementIndex))) &&
 10930 ElementIndex->getValue() == 1 &&
 10931 isa<FixedVectorType>(VectorOperand->getType()) &&
 10932 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
 10933}
10934
10935/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
10936static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
// NOTE(review): the body (file line 10937) is missing from this extract;
// presumably it checks isOperandOfVmullHighP64 on both operands — confirm
// against the full source.
 10938}
10939
10940/// Check if sinking \p I's operands to I's basic block is profitable, because
10941/// the operands can be folded into a target instruction, e.g.
10942/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
// NOTE(review): the first line of the signature (file line 10943) is missing
// from this extract; this is AArch64TargetLowering::shouldSinkOperands.
 10944 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
 10945 if (!I->getType()->isVectorTy())
 10946 return false;
 10947
// NOTE(review): the dyn_cast to IntrinsicInst binding `II` (file line 10948)
// is missing from this extract.
 10949 switch (II->getIntrinsicID()) {
 10950 case Intrinsic::aarch64_neon_umull:
 10951 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
 10952 return false;
 10953 Ops.push_back(&II->getOperandUse(0));
 10954 Ops.push_back(&II->getOperandUse(1));
 10955 return true;
 10956
 10957 case Intrinsic::aarch64_neon_pmull64:
 10958 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
 10959 II->getArgOperand(1)))
 10960 return false;
 10961 Ops.push_back(&II->getArgOperandUse(0));
 10962 Ops.push_back(&II->getArgOperandUse(1));
 10963 return true;
 10964
 10965 default:
 10966 return false;
 10967 }
 10968 }
 10969
 10970 switch (I->getOpcode()) {
 10971 case Instruction::Sub:
 10972 case Instruction::Add: {
 10973 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
 10974 return false;
 10975
 10976 // If the exts' operands extract either the lower or upper elements, we
 10977 // can sink them too.
 10978 auto Ext1 = cast<Instruction>(I->getOperand(0));
 10979 auto Ext2 = cast<Instruction>(I->getOperand(1));
// NOTE(review): the condition guarding this block (file line 10980, presumably
// the areExtractShuffleVectors check on the exts' operands) is missing from
// this extract.
 10981 Ops.push_back(&Ext1->getOperandUse(0));
 10982 Ops.push_back(&Ext2->getOperandUse(0));
 10983 }
 10984
 10985 Ops.push_back(&I->getOperandUse(0));
 10986 Ops.push_back(&I->getOperandUse(1));
 10987
 10988 return true;
 10989 }
 10990 case Instruction::Mul: {
 10991 bool IsProfitable = false;
 10992 for (auto &Op : I->operands()) {
 10993 // Make sure we are not already sinking this operand
 10994 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
 10995 continue;
 10996
// NOTE(review): the dyn_cast binding `Shuffle` (file line 10997) is missing
// from this extract.
 10998 if (!Shuffle || !Shuffle->isZeroEltSplat())
 10999 continue;
 11000
 11001 Value *ShuffleOperand = Shuffle->getOperand(0);
// NOTE(review): the dyn_cast binding `Insert` (file line 11002) is missing
// from this extract.
 11003 if (!Insert)
 11004 continue;
 11005
 11006 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
 11007 if (!OperandInstr)
 11008 continue;
 11009
// NOTE(review): the declaration of `ElementConstant` (file line 11010) is
// missing from this extract.
 11011 dyn_cast<ConstantInt>(Insert->getOperand(2));
 11012 // Check that the insertelement is inserting into element 0
 11013 if (!ElementConstant || ElementConstant->getZExtValue() != 0)
 11014 continue;
 11015
 11016 unsigned Opcode = OperandInstr->getOpcode();
 11017 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
 11018 continue;
 11019
 11020 Ops.push_back(&Shuffle->getOperandUse(0));
 11021 Ops.push_back(&Op);
 11022 IsProfitable = true;
 11023 }
 11024
 11025 return IsProfitable;
 11026 }
 11027 default:
 11028 return false;
 11029 }
 11030 return false;
 11031}
11032
// Returns true if paired (ldp) loads are available for LoadedType: simple
// 32- or 64-bit integer/FP types. NOTE(review): the signature line (file line
// 11033) and the assignment to RequiredAligment (file line 11039) are missing
// from this extract.
 11034 Align &RequiredAligment) const {
 11035 if (!LoadedType.isSimple() ||
 11036 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
 11037 return false;
 11038 // Cyclone supports unaligned accesses.
 11040 unsigned NumBits = LoadedType.getSizeInBits();
 11041 return NumBits == 32 || NumBits == 64;
 11042}
11043
11044/// A helper function for determining the number of interleaved accesses we
11045/// will generate when lowering accesses of the given type.
// Rounds the vector's bit size up to a multiple of 128 (one 128-bit NEON
// access per chunk). NOTE(review): the parameter list line (file line 11047)
// is missing from this extract.
11046unsigned
 11048 const DataLayout &DL) const {
 11049 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
 11050}
11051
// Tags Falkor strided-access loads with the target-specific MOStridedAccess
// memory-operand flag. NOTE(review): the signature (file lines 11052-11053)
// and the default return (file line 11057) are missing from this extract.
 11054 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
 11055 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
 11056 return MOStridedAccess;
 11058}
11059
// Returns true if VecTy is a legal type for NEON interleaved (ldN/stN)
// accesses: >1 element, 8/16/32/64-bit elements, total size 64 bits or a
// multiple of 128 bits. NOTE(review): the first signature line (file line
// 11060) is missing from this extract.
 11061 VectorType *VecTy, const DataLayout &DL) const {
 11062
 11063 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
 11064 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
 11065
 11066 // Ensure the number of vector elements is greater than 1.
 11067 if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
 11068 return false;
 11069
 11070 // Ensure the element type is legal.
 11071 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
 11072 return false;
 11073
 11074 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
 11075 // 128 will be split into multiple interleaved accesses.
 11076 return VecSize == 64 || VecSize % 128 == 0;
 11077}
11078
11079/// Lower an interleaved load into a ldN intrinsic.
11080///
11081/// E.g. Lower an interleaved load (Factor = 2):
11082/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
11083/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
11084/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
11085///
11086/// Into:
11087/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
11088/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
11089/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
// NOTE(review): the signature (file lines 11090-11091) is missing from this
// extract; this is AArch64TargetLowering::lowerInterleavedLoad(LoadInst *LI,
// ...shuffles...) — confirm against the full source.
 11092 ArrayRef<unsigned> Indices, unsigned Factor) const {
 11093 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 11094 "Invalid interleave factor");
 11095 assert(!Shuffles.empty() && "Empty shufflevector input");
 11096 assert(Shuffles.size() == Indices.size() &&
 11097 "Unmatched number of shufflevectors and indices");
 11098
 11099 const DataLayout &DL = LI->getModule()->getDataLayout();
 11100
 11101 VectorType *VTy = Shuffles[0]->getType();
 11102
 11103 // Skip if we do not have NEON and skip illegal vector types. We can
 11104 // "legalize" wide vector types into multiple interleaved accesses as long as
 11105 // the vector types are divisible by 128.
 11106 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
 11107 return false;
 11108
 11109 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
 11110
 11111 auto *FVTy = cast<FixedVectorType>(VTy);
 11112
 11113 // A pointer vector can not be the return type of the ldN intrinsics. Need to
 11114 // load integer vectors first and then convert to pointer vectors.
 11115 Type *EltTy = FVTy->getElementType();
 11116 if (EltTy->isPointerTy())
 11117 FVTy =
 11118 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
 11119
 11120 IRBuilder<> Builder(LI);
 11121
 11122 // The base address of the load.
 11123 Value *BaseAddr = LI->getPointerOperand();
 11124
 11125 if (NumLoads > 1) {
 11126 // If we're going to generate more than one load, reset the sub-vector type
 11127 // to something legal.
 11128 FVTy = FixedVectorType::get(FVTy->getElementType(),
 11129 FVTy->getNumElements() / NumLoads);
 11130
 11131 // We will compute the pointer operand of each load from the original base
 11132 // address using GEPs. Cast the base address to a pointer to the scalar
 11133 // element type.
 11134 BaseAddr = Builder.CreateBitCast(
 11135 BaseAddr,
 11136 FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
 11137 }
 11138
 11139 Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
 11140 Type *Tys[2] = {FVTy, PtrTy};
 11141 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
 11142 Intrinsic::aarch64_neon_ld3,
 11143 Intrinsic::aarch64_neon_ld4};
 11144 Function *LdNFunc =
 11145 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 11146
 11147 // Holds sub-vectors extracted from the load intrinsic return values. The
 11148 // sub-vectors are associated with the shufflevector instructions they will
 11149 // replace.
// NOTE(review): the declaration of the SubVecs map (file line 11150) is
// missing from this extract.
 11151
 11152 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 11153
 11154 // If we're generating more than one load, compute the base address of
 11155 // subsequent loads as an offset from the previous.
 11156 if (LoadCount > 0)
 11157 BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
 11158 FVTy->getNumElements() * Factor);
 11159
 11160 CallInst *LdN = Builder.CreateCall(
 11161 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
 11162
 11163 // Extract and store the sub-vectors returned by the load intrinsic.
 11164 for (unsigned i = 0; i < Shuffles.size(); i++) {
 11165 ShuffleVectorInst *SVI = Shuffles[i];
 11166 unsigned Index = Indices[i];
 11167
 11168 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
 11169
 11170 // Convert the integer vector to pointer vector if the element is pointer.
 11171 if (EltTy->isPointerTy())
 11172 SubVec = Builder.CreateIntToPtr(
// NOTE(review): part of this CreateIntToPtr call (file line 11173) is missing
// from this extract.
 11174 FVTy->getNumElements()));
 11175 SubVecs[SVI].push_back(SubVec);
 11176 }
 11177 }
 11178
 11179 // Replace uses of the shufflevector instructions with the sub-vectors
 11180 // returned by the load intrinsic. If a shufflevector instruction is
 11181 // associated with more than one sub-vector, those sub-vectors will be
 11182 // concatenated into a single wide vector.
 11183 for (ShuffleVectorInst *SVI : Shuffles) {
 11184 auto &SubVec = SubVecs[SVI];
 11185 auto *WideVec =
 11186 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
 11187 SVI->replaceAllUsesWith(WideVec);
 11188 }
 11189
 11190 return true;
 11191}
11192
11193/// Lower an interleaved store into a stN intrinsic.
11194///
11195/// E.g. Lower an interleaved store (Factor = 3):
11196/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11197/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11198/// store <12 x i32> %i.vec, <12 x i32>* %ptr
11199///
11200/// Into:
11201/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11202/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11203/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11204/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11205///
11206/// Note that the new shufflevectors will be removed and we'll only generate one
11207/// st3 instruction in CodeGen.
11208///
11209/// Example for a more general valid mask (Factor 3). Lower:
11210/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
11211/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
11212/// store <12 x i32> %i.vec, <12 x i32>* %ptr
11213///
11214/// Into:
11215/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
11216/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
11217/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
11218/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
// NOTE(review): the first signature line (file line 11219) is missing from
// this extract; this is AArch64TargetLowering::lowerInterleavedStore(StoreInst
// *SI, ...).
 11220 ShuffleVectorInst *SVI,
 11221 unsigned Factor) const {
 11222 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 11223 "Invalid interleave factor");
 11224
 11225 auto *VecTy = cast<FixedVectorType>(SVI->getType());
 11226 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
 11227
 11228 unsigned LaneLen = VecTy->getNumElements() / Factor;
 11229 Type *EltTy = VecTy->getElementType();
 11230 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
 11231
 11232 const DataLayout &DL = SI->getModule()->getDataLayout();
 11233
 11234 // Skip if we do not have NEON and skip illegal vector types. We can
 11235 // "legalize" wide vector types into multiple interleaved accesses as long as
 11236 // the vector types are divisible by 128.
 11237 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
 11238 return false;
 11239
 11240 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
 11241
 11242 Value *Op0 = SVI->getOperand(0);
 11243 Value *Op1 = SVI->getOperand(1);
// NOTE(review): the IRBuilder construction (file line 11244) is missing from
// this extract.
 11245
 11246 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
 11247 // vectors to integer vectors.
 11248 if (EltTy->isPointerTy()) {
 11249 Type *IntTy = DL.getIntPtrType(EltTy);
 11250 unsigned NumOpElts =
 11251 cast<FixedVectorType>(Op0->getType())->getNumElements();
 11252
 11253 // Convert to the corresponding integer vector.
 11254 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
 11255 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
 11256 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 11257
// NOTE(review): the reassignment of SubVecTy for the integer element type
// (file line 11258) is missing from this extract.
 11259 }
 11260
 11261 // The base address of the store.
 11262 Value *BaseAddr = SI->getPointerOperand();
 11263
 11264 if (NumStores > 1) {
 11265 // If we're going to generate more than one store, reset the lane length
 11266 // and sub-vector type to something legal.
 11267 LaneLen /= NumStores;
 11268 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
 11269
 11270 // We will compute the pointer operand of each store from the original base
 11271 // address using GEPs. Cast the base address to a pointer to the scalar
 11272 // element type.
 11273 BaseAddr = Builder.CreateBitCast(
 11274 BaseAddr,
 11275 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
 11276 }
 11277
 11278 auto Mask = SVI->getShuffleMask();
 11279
 11280 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
 11281 Type *Tys[2] = {SubVecTy, PtrTy};
 11282 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
 11283 Intrinsic::aarch64_neon_st3,
 11284 Intrinsic::aarch64_neon_st4};
 11285 Function *StNFunc =
 11286 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 11287
 11288 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 11289
// NOTE(review): the declaration of the Ops operand vector (file line 11290)
// is missing from this extract.
 11291
 11292 // Split the shufflevector operands into sub vectors for the new stN call.
 11293 for (unsigned i = 0; i < Factor; i++) {
 11294 unsigned IdxI = StoreCount * LaneLen * Factor + i;
 11295 if (Mask[IdxI] >= 0) {
 11296 Ops.push_back(Builder.CreateShuffleVector(
 11297 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
 11298 } else {
 11299 unsigned StartMask = 0;
 11300 for (unsigned j = 1; j < LaneLen; j++) {
 11301 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
 11302 if (Mask[IdxJ * Factor + IdxI] >= 0) {
 11303 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
 11304 break;
 11305 }
 11306 }
 11307 // Note: Filling undef gaps with random elements is ok, since
 11308 // those elements were being written anyway (with undefs).
 11309 // In the case of all undefs we're defaulting to using elems from 0
 11310 // Note: StartMask cannot be negative, it's checked in
 11311 // isReInterleaveMask
 11312 Ops.push_back(Builder.CreateShuffleVector(
 11313 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
 11314 }
 11315 }
 11316
 11317 // If we generating more than one store, we compute the base address of
 11318 // subsequent stores as an offset from the previous.
 11319 if (StoreCount > 0)
 11320 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
 11321 BaseAddr, LaneLen * Factor);
 11322
 11323 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
 11324 Builder.CreateCall(StNFunc, Ops);
 11325 }
 11326 return true;
 11327}
11328
11329// Lower an SVE structured load intrinsic returning a tuple type to target
11330// specific intrinsic taking the same input but returning a multi-result value
11331// of the split tuple type.
11332//
11333// E.g. Lowering an LD3:
11334//
11335// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
11336// <vscale x 4 x i1> %pred,
11337// <vscale x 4 x i32>* %addr)
11338//
11339// Output DAG:
11340//
11341// t0: ch = EntryToken
11342// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
11343// t4: i64,ch = CopyFromReg t0, Register:i64 %1
11344// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
11345// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
11346//
11347// This is called pre-legalization to avoid widening/splitting issues with
11348// non-power-of-2 tuple types used for LD3, such as nxv12i32.
11349SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
// NOTE(review): the LoadOps parameter line (file line 11350) is missing from
// this extract.
 11351 EVT VT, SelectionDAG &DAG,
 11352 const SDLoc &DL) const {
 11353 assert(VT.isScalableVector() && "Can only lower scalable vectors");
 11354
 11355 unsigned N, Opcode;
 11356 static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
 11357 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
 11358 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
 11359 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
 11360
 11361 std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
// NOTE(review): the assert condition paired with this message (file line
// 11362) is missing from this extract.
 11363 "invalid tuple vector type!");
 11364
// NOTE(review): the computation of SplitVT and the construction of the VTs
// list (file lines 11366-11368 and 11370) are missing from this extract.
 11365 EVT SplitVT =
 11369
 11371 VTs.push_back(MVT::Other); // Chain
 11372 SDVTList NodeTys = DAG.getVTList(VTs);
 11373
 11374 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
// NOTE(review): the declaration of PseudoLoadOps (file line 11375) is missing
// from this extract.
 11376 for (unsigned I = 0; I < N; ++I)
 11377 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
 11378 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
 11379}
11380
// Picks the widest store type for a memcpy/memset-style MemOp: v2i64 for
// large NEON memsets, then f128 / i64 / i32 by size and alignment, else
// MVT::Other. NOTE(review): the first signature line (file line 11381) and
// parts of the alignment/size guards (file lines 11395, 11401, 11403) are
// missing from this extract.
 11382 const MemOp &Op, const AttributeList &FuncAttributes) const {
 11383 bool CanImplicitFloat =
 11384 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
 11385 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
 11386 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 11387 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
 11388 // taken one instruction to materialize the v2i64 zero and one store (with
 11389 // restrictive addressing mode). Just do i64 stores.
 11390 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
 11391 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
 11392 if (Op.isAligned(AlignCheck))
 11393 return true;
 11394 bool Fast;
 11396 &Fast) &&
 11397 Fast;
 11398 };
 11399
 11400 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
 11402 return MVT::v2i64;
 11404 return MVT::f128;
 11405 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
 11406 return MVT::i64;
 11407 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
 11408 return MVT::i32;
 11409 return MVT::Other;
 11410}
11411
// GlobalISel counterpart of getOptimalMemOpType above: same selection logic,
// but returning LLT instead of EVT. NOTE(review): the first signature line
// (file line 11412) and parts of the guards (file lines 11426, 11432, 11434)
// are missing from this extract.
 11413 const MemOp &Op, const AttributeList &FuncAttributes) const {
 11414 bool CanImplicitFloat =
 11415 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
 11416 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
 11417 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 11418 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
 11419 // taken one instruction to materialize the v2i64 zero and one store (with
 11420 // restrictive addressing mode). Just do i64 stores.
 11421 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
 11422 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
 11423 if (Op.isAligned(AlignCheck))
 11424 return true;
 11425 bool Fast;
 11427 &Fast) &&
 11428 Fast;
 11429 };
 11430
 11431 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
 11433 return LLT::vector(2, 64);
 11435 return LLT::scalar(128);
 11436 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
 11437 return LLT::scalar(64);
 11438 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
 11439 return LLT::scalar(32);
 11440 return LLT();
 11441}
11442
11443// 12-bit optionally shifted immediates are legal for adds.
// Accepts imm12 or imm12 << 12 (after taking the absolute value, since add
// and sub share the encoding). NOTE(review): the signature line (file line
// 11444) is missing from this extract.
 11445 if (Immed == std::numeric_limits<int64_t>::min()) {
 11446 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
 11447 << ": avoid UB for INT64_MIN\n");
 11448 return false;
 11449 }
 11450 // Same encoding for add/sub, just flip the sign.
 11451 Immed = std::abs(Immed);
 11452 bool IsLegal = ((Immed >> 12) == 0 ||
 11453 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
 11454 LLVM_DEBUG(dbgs() << "Is " << Immed
 11455 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
 11456 return IsLegal;
 11457}
11458
11459// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
11460// immediates is the same as for an add or a sub.
11464
11465/// isLegalAddressingMode - Return true if the addressing mode represented
11466/// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the first signature line (file line 11467) is missing from
// this extract; this is AArch64TargetLowering::isLegalAddressingMode.
 11468 const AddrMode &AM, Type *Ty,
 11469 unsigned AS, Instruction *I) const {
 11470 // AArch64 has five basic addressing modes:
 11471 // reg
 11472 // reg + 9-bit signed offset
 11473 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
 11474 // reg1 + reg2
 11475 // reg + SIZE_IN_BYTES * reg
 11476
 11477 // No global is ever allowed as a base.
 11478 if (AM.BaseGV)
 11479 return false;
 11480
 11481 // No reg+reg+imm addressing.
 11482 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
 11483 return false;
 11484
 11485 // FIXME: Update this method to support scalable addressing modes.
// NOTE(review): the condition this return belongs to (file line 11486,
// presumably a scalable-vector type check) is missing from this extract.
 11487 return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
 11488
 11489 // check reg + imm case:
 11490 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
 11491 uint64_t NumBytes = 0;
 11492 if (Ty->isSized()) {
 11493 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
 11494 NumBytes = NumBits / 8;
 11495 if (!isPowerOf2_64(NumBits))
 11496 NumBytes = 0;
 11497 }
 11498
 11499 if (!AM.Scale) {
 11500 int64_t Offset = AM.BaseOffs;
 11501
 11502 // 9-bit signed offset
 11503 if (isInt<9>(Offset))
 11504 return true;
 11505
 11506 // 12-bit unsigned offset
 11507 unsigned shift = Log2_64(NumBytes);
 11508 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
 11509 // Must be a multiple of NumBytes (NumBytes is a power of 2)
 11510 (Offset >> shift) << shift == Offset)
 11511 return true;
 11512 return false;
 11513 }
 11514
 11515 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
 11516
 11517 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
 11518}
11519
// NOTE(review): the signature line (file line 11520) is missing from this
// extract; presumably bool
// AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {.
 11521 // Consider splitting large offset of struct or array.
 11522 return true;
 11523}
11524
// Returns the cost of the addressing-mode scale: 0 for scale 0/1, 1 for any
// other legal scale, -1 when the mode is illegal. NOTE(review): the first
// signature line (file line 11525) is missing from this extract.
 11526 const AddrMode &AM, Type *Ty,
 11527 unsigned AS) const {
 11528 // Scaling factors are not free at all.
 11529 // Operands | Rt Latency
 11530 // -------------------------------------------
 11531 // Rt, [Xn, Xm] | 4
 11532 // -------------------------------------------
 11533 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
 11534 // Rt, [Xn, Wm, <extend> #imm] |
 11535 if (isLegalAddressingMode(DL, AM, Ty, AS))
 11536 // Scale represents reg2 * scale, thus account for 1 if
 11537 // it is not equal to 0 or 1.
 11538 return AM.Scale != 0 && AM.Scale != 1;
 11539 return -1;
 11540}
11541
// FMA is faster than separate fmul+fadd for scalar f32/f64 (and vectors of
// them, via getScalarType). NOTE(review): the first signature line (file line
// 11542) is missing from this extract.
 11543 const MachineFunction &MF, EVT VT) const {
 11544 VT = VT.getScalarType();
 11545
 11546 if (!VT.isSimple())
 11547 return false;
 11548
 11549 switch (VT.getSimpleVT().SimpleTy) {
 11550 case MVT::f32:
 11551 case MVT::f64:
 11552 return true;
 11553 default:
 11554 break;
 11555 }
 11556
 11557 return false;
 11558}
11559
// IR-type overload of the above: true for float/double scalar types.
// NOTE(review): the first signature line (file line 11560) is missing from
// this extract.
 11561 Type *Ty) const {
 11562 switch (Ty->getScalarType()->getTypeID()) {
 11563 case Type::FloatTyID:
 11564 case Type::DoubleTyID:
 11565 return true;
 11566 default:
 11567 return false;
 11568 }
 11569}
11570
// Scratch registers clobberable at stackmap/patchpoint sites. NOTE(review):
// the signature line (file line 11572) is missing from this extract.
11571const MCPhysReg *
 11573 // LR is a callee-save register, but we must treat it as clobbered by any call
 11574 // site. Hence we include LR in the scratch registers, which are in turn added
 11575 // as implicit-defs for stackmaps and patchpoints.
 11576 static const MCPhysReg ScratchRegs[] = {
 11577 AArch64::X16, AArch64::X17, AArch64::LR, 0
 11578 };
 11579 return ScratchRegs;
 11580}
11581
// Returns false (don't commute) when N's operand is an unsigned bit-extract
// pattern ((x >> C) & mask) that should be left intact to form UBFX.
// NOTE(review): the signature line (file line 11583) is missing from this
// extract.
11582bool
 11584 CombineLevel Level) const {
 11585 N = N->getOperand(0).getNode();
 11586 EVT VT = N->getValueType(0);
 11587 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
 11588 // it with shift to let it be lowered to UBFX.
 11589 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
 11590 isa<ConstantSDNode>(N->getOperand(1))) {
 11591 uint64_t TruncMask = N->getConstantOperandVal(1);
 11592 if (isMask_64(TruncMask) &&
 11593 N->getOperand(0).getOpcode() == ISD::SRL &&
 11594 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
 11595 return false;
 11596 }
 11597 return true;
 11598}
11599
// Prefer materializing the constant over loading it when it needs a MOVZ plus
// at most two MOVKs (logical immediates are always cheap). NOTE(review): the
// first signature line (file line 11600) is missing from this extract.
 11601 Type *Ty) const {
 11602 assert(Ty->isIntegerTy());
 11603
 11604 unsigned BitSize = Ty->getPrimitiveSizeInBits();
 11605 if (BitSize == 0)
 11606 return false;
 11607
 11608 int64_t Val = Imm.getSExtValue();
 11609 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
 11610 return true;
 11611
// Negative values are materialized with MOVN; count MOVKs on the inverted
// bits.
 11612 if ((int64_t)Val < 0)
 11613 Val = ~Val;
 11614 if (BitSize == 32)
 11615 Val &= (1LL << 32) - 1;
 11616
 11617 unsigned LZ = countLeadingZeros((uint64_t)Val);
 11618 unsigned Shift = (63 - LZ) / 16;
 11619 // MOVZ is free so return true for one or fewer MOVK.
 11620 return Shift < 3;
 11621}
11622
// Extracting a subvector is cheap only at index 0 (low half) or at the result
// element count (high half). NOTE(review): the signature (file line 11623) and
// a guard condition (file line 11625) are missing from this extract.
 11624 unsigned Index) const {
 11626 return false;
 11627
 11628 return (Index == 0 || Index == ResVT.getVectorNumElements());
 11629}
11630
11631/// Turn vector tests of the signbit in the form of:
11632/// xor (sra X, elt_size(X)-1), -1
11633/// into:
11634/// cmge X, X, #0
// NOTE(review): the signature line (file line 11635) is missing from this
// extract; presumably static SDValue foldVectorXorShiftIntoCmp(SDNode *N,
// SelectionDAG &DAG, ...).
 11636 const AArch64Subtarget *Subtarget) {
 11637 EVT VT = N->getValueType(0);
 11638 if (!Subtarget->hasNEON() || !VT.isVector())
 11639 return SDValue();
 11640
 11641 // There must be a shift right algebraic before the xor, and the xor must be a
 11642 // 'not' operation.
 11643 SDValue Shift = N->getOperand(0);
 11644 SDValue Ones = N->getOperand(1);
 11645 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
 11646 !ISD::isBuildVectorAllOnes(Ones.getNode()))
 11647 return SDValue();
 11648
 11649 // The shift should be smearing the sign bit across each vector element.
 11650 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
// NOTE(review): the declaration of ShiftEltTy (file line 11651) is missing
// from this extract.
 11652 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
 11653 return SDValue();
 11654
 11655 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
 11656}
11657
11658// VECREDUCE_ADD( EXTEND(v16i8_type) ) to
11659// VECREDUCE_ADD( DOTv16i8(v16i8_type) )
// NOTE(review): the signature line (file line 11660) is missing from this
// extract; presumably static SDValue performVecReduceAddCombine(SDNode *N,
// SelectionDAG &DAG, const AArch64Subtarget *ST).
 11661 const AArch64Subtarget *ST) {
 11662 SDValue Op0 = N->getOperand(0);
 11663 if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32)
 11664 return SDValue();
 11665
// NOTE(review): a guard condition (file line 11666) is missing from this
// extract.
 11667 return SDValue();
 11668
 11669 unsigned ExtOpcode = Op0.getOpcode();
// NOTE(review): the extend-opcode check (file line 11670) is missing from
// this extract.
 11671 return SDValue();
 11672
 11673 EVT Op0VT = Op0.getOperand(0).getValueType();
 11674 if (Op0VT != MVT::v16i8)
 11675 return SDValue();
 11676
 11677 SDLoc DL(Op0);
 11678 SDValue Ones = DAG.getConstant(1, DL, Op0VT);
// NOTE(review): the declaration of Zeros and the start of the intrinsic
// selection (file lines 11679-11680) are missing from this extract. The
// select chooses udot for zero-extends and sdot for sign-extends.
 11681 ? Intrinsic::aarch64_neon_udot
 11682 : Intrinsic::aarch64_neon_sdot;
 11683 SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(),
// NOTE(review): an operand line of this getNode call (file line 11684) is
// missing from this extract.
 11685 Ones, Op0.getOperand(0));
 11686 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
 11687}
11688
11689// Given a ABS node, detect the following pattern:
11690// (ABS (SUB (EXTEND a), (EXTEND b))).
11691// Generates UABD/SABD instruction.
// NOTE(review): the signature (file lines 11692-11693) is missing from this
// extract; presumably static SDValue performABSCombine(SDNode *N,
// SelectionDAG &DAG, ...).
 11694 const AArch64Subtarget *Subtarget) {
 11695 SDValue AbsOp1 = N->getOperand(0);
 11696 SDValue Op0, Op1;
 11697
 11698 if (AbsOp1.getOpcode() != ISD::SUB)
 11699 return SDValue();
 11700
 11701 Op0 = AbsOp1.getOperand(0);
 11702 Op1 = AbsOp1.getOperand(1);
 11703
 11704 unsigned Opc0 = Op0.getOpcode();
 11705 // Check if the operands of the sub are (zero|sign)-extended.
 11706 if (Opc0 != Op1.getOpcode() ||
 11707 (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
 11708 return SDValue();
 11709
// NOTE(review): the declarations of VectorT1/VectorT2 (file lines
// 11710-11711, the pre-extension operand types) are missing from this
// extract.
 11712 // Check if vectors are of same type and valid size.
 11713 uint64_t Size = VectorT1.getFixedSizeInBits();
 11714 if (VectorT1 != VectorT2 || (Size != 64 && Size != 128))
 11715 return SDValue();
 11716
 11717 // Check if vector element types are valid.
 11718 EVT VT1 = VectorT1.getVectorElementType();
 11719 if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32)
 11720 return SDValue();
 11721
 11722 Op0 = Op0.getOperand(0);
 11723 Op1 = Op1.getOperand(0);
 11724 unsigned ABDOpcode =
// NOTE(review): the opcode selection expression (file line 11725, choosing
// between the signed and unsigned ABD node) is missing from this extract.
 11726 SDValue ABD =
 11727 DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1);
 11728 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD);
 11729}
11730
// DAG-combine entry point for XOR nodes: after legalization, tries to fold a
// vector xor-of-sign-shift into a CMGEz comparison (see
// foldVectorXorShiftIntoCmp above).
// NOTE(review): doxygen scrape — original lines 11731-11732 (the signature;
// presumably performXorCombine(SDNode *N, SelectionDAG &DAG,
// TargetLowering::DAGCombinerInfo &DCI, ...)) were dropped by the extractor.
11733 const AArch64Subtarget *Subtarget) {
11734 if (DCI.isBeforeLegalizeOps())
11735 return SDValue();
11736
11737 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
11738}
11739
// Custom lowering of sdiv-by-power-of-two for scalar i32/i64:
//   (X + (X < 0 ? 2^Lg2 - 1 : 0)) >> Lg2, negated if the divisor is negative.
// Nodes created for the combiner are recorded in Created.
// NOTE(review): doxygen scrape — original lines 11743-11744 were dropped by
// the extractor; they carry the trailing `SmallVectorImpl<SDNode *> &Created`
// parameter and the definition of `Attr` that is used on the next line.
11740SDValue
11741AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
11742 SelectionDAG &DAG,
11745 if (isIntDivCheap(N->getValueType(0), Attr))
11746 return SDValue(N,0); // Lower SDIV as SDIV
11747
11748 // fold (sdiv X, pow2)
11749 EVT VT = N->getValueType(0);
11750 if ((VT != MVT::i32 && VT != MVT::i64) ||
11751 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
11752 return SDValue();
11753
11754 SDLoc DL(N);
11755 SDValue N0 = N->getOperand(0);
11756 unsigned Lg2 = Divisor.countTrailingZeros();
11757 SDValue Zero = DAG.getConstant(0, DL, VT);
11758 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
11759
// Bias negative dividends so the arithmetic shift rounds toward zero,
// matching C signed division semantics.
11760 // Add (N0 < 0) ? Pow2 - 1 : 0;
11761 SDValue CCVal;
11762 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
11763 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
11764 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
11765
11766 Created.push_back(Cmp.getNode());
11767 Created.push_back(Add.getNode());
11768 Created.push_back(CSel.getNode());
11769
11770 // Divide by pow2.
11771 SDValue SRA =
11772 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
11773
11774 // If we're dividing by a positive value, we're done. Otherwise, we must
11775 // negate the result.
11776 if (Divisor.isNonNegative())
11777 return SRA;
11778
11779 Created.push_back(SRA.getNode());
11780 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
11781}
11782
// Returns true when the value is one of the SVE element-count intrinsics
// (cntb/cnth/cntw/cntd); used by performMulCombine to keep multiplies of
// these foldable into the cnt instruction's scaling operand.
// NOTE(review): doxygen scrape — original line 11783 (the signature, taking
// an SDValue `S` per the use below) was dropped by the extractor.
11784 switch(getIntrinsicID(S.getNode())) {
11785 default:
11786 break;
11787 case Intrinsic::aarch64_sve_cntb:
11788 case Intrinsic::aarch64_sve_cnth:
11789 case Intrinsic::aarch64_sve_cntw:
11790 case Intrinsic::aarch64_sve_cntd:
11791 return true;
11792 }
11793 return false;
11794}
11795
// NOTE(review): doxygen scrape — original lines 11808 (signature), 11815 (the
// opening `{` of the AssertSext/AssertZext case body) and 11822 (the
// declaration of `Constant`) were dropped by the extractor.
11796/// Calculates what the pre-extend type is, based on the extension
11797/// operation node provided by \p Extend.
11798///
11799/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
11800/// pre-extend type is pulled directly from the operand, while other extend
11801/// operations need a bit more inspection to get this information.
11802///
11803/// \param Extend The SDNode from the DAG that represents the extend operation
11804/// \param DAG The SelectionDAG hosting the \p Extend node
11805///
11806/// \returns The type representing the \p Extend source type, or \p MVT::Other
11807/// if no valid type can be determined
11809 switch (Extend.getOpcode()) {
11810 case ISD::SIGN_EXTEND:
11811 case ISD::ZERO_EXTEND:
11812 return Extend.getOperand(0).getValueType();
// AssertSext/AssertZext carry the narrow type as a VTSDNode operand.
11813 case ISD::AssertSext:
11814 case ISD::AssertZext:
11816 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
11817 if (!TypeNode)
11818 return MVT::Other;
11819 return TypeNode->getVT();
11820 }
// An AND with an all-ones mask of i8/i16/i32 width acts as a zero-extend.
11821 case ISD::AND: {
11823 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
11824 if (!Constant)
11825 return MVT::Other;
11826
11827 uint32_t Mask = Constant->getZExtValue();
11828
11829 if (Mask == UCHAR_MAX)
11830 return MVT::i8;
11831 else if (Mask == USHRT_MAX)
11832 return MVT::i16;
11833 else if (Mask == UINT_MAX)
11834 return MVT::i32;
11835
11836 return MVT::Other;
11837 }
11838 default:
11839 return MVT::Other;
11840 }
11841
11842 llvm_unreachable("Code path unhandled in calculatePreExtendType!");
11843}
11844
// NOTE(review): doxygen scrape — several original lines were dropped by the
// extractor: 11847 (first signature line; the parameter is `VectorShuffle`
// per the uses below), 11850-11851 (the `ShuffleNode` dyn_cast), 11865 (the
// `Constant` dyn_cast of InsertLane), 11881/11885/11889-11890/11893 (the
// PreExtendType/PreExtendVT computation and validity checks), and
// 11901/11903-11904/11910-11911/11914 (the SDLoc and the getNode calls that
// build the new dup/shuffle/extend). Restore from the original file before
// compiling.
11845/// Combines a dup(sext/zext) node pattern into sext/zext(dup)
11846/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
11848 SelectionDAG &DAG) {
11849
11852 if (!ShuffleNode)
11853 return SDValue();
11854
// A splat from lane 0 is how a scalar dup is represented as a shuffle.
11855 // Ensuring the mask is zero before continuing
11856 if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
11857 return SDValue();
11858
11859 SDValue InsertVectorElt = VectorShuffle.getOperand(0);
11860
11861 if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
11862 return SDValue();
11863
11864 SDValue InsertLane = InsertVectorElt.getOperand(2);
11866 // Ensures the insert is inserting into lane 0
11867 if (!Constant || Constant->getZExtValue() != 0)
11868 return SDValue();
11869
11870 SDValue Extend = InsertVectorElt.getOperand(1);
11871 unsigned ExtendOpcode = Extend.getOpcode();
11872
11873 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
11874 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
11875 ExtendOpcode == ISD::AssertSext;
11876 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
11877 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
11878 return SDValue();
11879
11880 EVT TargetType = VectorShuffle.getValueType();
11882
// Only widening dups into these wide vector types are worth rewriting.
11883 if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
11884 TargetType != MVT::v2i64) ||
11886 return SDValue();
11887
11888 // Restrict valid pre-extend data type
11891 return SDValue();
11892
11894
// The rewrite is only a pure extend if lane counts match and each element
// exactly doubles in width.
11895 if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
11896 return SDValue();
11897
11898 if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
11899 return SDValue();
11900
11902
11905 DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
11906 DAG.getConstant(0, DL, MVT::i64));
11907
// All-zero mask: splat lane 0 of the narrow insert across the vector.
11908 std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
11909
11912 DAG.getUNDEF(PreExtendVT), ShuffleMask);
11913
11915 DL, TargetType, VectorShuffleNode);
11916
11917 return ExtendNode;
11918}
11919
// NOTE(review): doxygen scrape — original line 11922 (the signature; the
// parameters are `Mul` and `DAG` per the uses below) was dropped by the
// extractor.
11920/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
11921/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
11923 // If the value type isn't a vector, none of the operands are going to be dups
11924 if (!Mul->getValueType(0).isVector())
11925 return SDValue();
11926
// Try the dup(ext) -> ext(dup) rewrite on each multiplicand independently.
11927 SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
11928 SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
11929
11930 // Neither operands have been changed, don't make any further changes
11931 if (!Op0 && !Op1)
11932 return SDValue();
11933
11934 SDLoc DL(Mul);
// Rebuild the mul, keeping whichever operands were not rewritten.
11935 return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
11936 Op0 ? Op0 : Mul->getOperand(0),
11937 Op1 ? Op1 : Mul->getOperand(1));
11938}
11939
// DAG-combine for MUL: first tries the dup(ext) rewrite above, then lowers
// multiplies by (2^N ± 1) (optionally times 2^M) into shift+add/sub
// sequences, which beat MADD on the targeted cores.
// NOTE(review): doxygen scrape — original lines 11940-11941 (signature),
// 11993 (ShiftedConstValue definition), 12005 (SCVMinus1 definition),
// 12018-12019 (CVNegPlus1/CVNegMinus1 definitions), 12037-12038
// (AddSubN0/AddSubN1 selection), 12040 (start of the assert) and 12048 (the
// shift-amount operand) were dropped by the extractor.
11942 const AArch64Subtarget *Subtarget) {
11943
11944 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
11945 return Ext;
11946
11947 if (DCI.isBeforeLegalizeOps())
11948 return SDValue();
11949
11950 // The below optimizations require a constant RHS.
11951 if (!isa<ConstantSDNode>(N->getOperand(1)))
11952 return SDValue();
11953
11954 SDValue N0 = N->getOperand(0);
11955 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
11956 const APInt &ConstValue = C->getAPIntValue();
11957
11958 // Allow the scaling to be folded into the `cnt` instruction by preventing
11959 // the scaling to be obscured here. This makes it easier to pattern match.
11960 if (IsSVECntIntrinsic(N0) ||
11961 (N0->getOpcode() == ISD::TRUNCATE &&
11962 (IsSVECntIntrinsic(N0->getOperand(0)))))
11963 if (ConstValue.sge(1) && ConstValue.sle(16))
11964 return SDValue();
11965
11966 // Multiplication of a power of two plus/minus one can be done more
11967 // cheaply as as shift+add/sub. For now, this is true unilaterally. If
11968 // future CPUs have a cheaper MADD instruction, this may need to be
11969 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
11970 // 64-bit is 5 cycles, so this is always a win.
11971 // More aggressively, some multiplications N0 * C can be lowered to
11972 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
11973 // e.g. 6=3*2=(2+1)*2.
11974 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
11975 // which equals to (1+2)*16-(1+2).
11976 // TrailingZeroes is used to test if the mul can be lowered to
11977 // shift+add+shift.
11978 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
11979 if (TrailingZeroes) {
11980 // Conservatively do not lower to shift+add+shift if the mul might be
11981 // folded into smul or umul.
11982 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
11983 isZeroExtended(N0.getNode(), DAG)))
11984 return SDValue();
11985 // Conservatively do not lower to shift+add+shift if the mul might be
11986 // folded into madd or msub.
11987 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
11988 N->use_begin()->getOpcode() == ISD::SUB))
11989 return SDValue();
11990 }
11991 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
11992 // and shift+add+shift.
11994
11995 unsigned ShiftAmt, AddSubOpc;
11996 // Is the shifted value the LHS operand of the add/sub?
11997 bool ShiftValUseIsN0 = true;
11998 // Do we need to negate the result?
11999 bool NegateResult = false;
12000
12001 if (ConstValue.isNonNegative()) {
12002 // (mul x, 2^N + 1) => (add (shl x, N), x)
12003 // (mul x, 2^N - 1) => (sub (shl x, N), x)
12004 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
12006 APInt CVPlus1 = ConstValue + 1;
12007 if (SCVMinus1.isPowerOf2()) {
12008 ShiftAmt = SCVMinus1.logBase2();
12009 AddSubOpc = ISD::ADD;
12010 } else if (CVPlus1.isPowerOf2()) {
12011 ShiftAmt = CVPlus1.logBase2();
12012 AddSubOpc = ISD::SUB;
12013 } else
12014 return SDValue();
12015 } else {
12016 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12017 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12020 if (CVNegPlus1.isPowerOf2()) {
12021 ShiftAmt = CVNegPlus1.logBase2();
12022 AddSubOpc = ISD::SUB;
12023 ShiftValUseIsN0 = false;
12024 } else if (CVNegMinus1.isPowerOf2()) {
12025 ShiftAmt = CVNegMinus1.logBase2();
12026 AddSubOpc = ISD::ADD;
12027 NegateResult = true;
12028 } else
12029 return SDValue();
12030 }
12031
12032 SDLoc DL(N);
12033 EVT VT = N->getValueType(0);
12034 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
12035 DAG.getConstant(ShiftAmt, DL, MVT::i64));
12036
12039 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
12041 "NegateResult and TrailingZeroes cannot both be true for now.");
12042 // Negate the result.
12043 if (NegateResult)
12044 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
12045 // Shift the result.
12046 if (TrailingZeroes)
12047 return DAG.getNode(ISD::SHL, DL, VT, Res,
12049 return Res;
12050}
12051
// NOTE(review): doxygen scrape — original lines 12052 (first signature line)
// and 12088-12089 (the MaskConst bitcast of SourceConst and the NewAnd node
// that is used at 12091) were dropped by the extractor.
12053 SelectionDAG &DAG) {
12054 // Take advantage of vector comparisons producing 0 or -1 in each lane to
12055 // optimize away operation when it's from a constant.
12056 //
12057 // The general transformation is:
12058 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
12059 // AND(VECTOR_CMP(x,y), constant2)
12060 // constant2 = UNARYOP(constant)
12061
12062 // Early exit if this isn't a vector operation, the operand of the
12063 // unary operation isn't a bitwise AND, or if the sizes of the operations
12064 // aren't the same.
12065 EVT VT = N->getValueType(0);
12066 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
12067 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
12068 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
12069 return SDValue();
12070
12071 // Now check that the other operand of the AND is a constant. We could
12072 // make the transformation for non-constant splats as well, but it's unclear
12073 // that would be a benefit as it would not eliminate any operations, just
12074 // perform one more step in scalar code before moving to the vector unit.
12075 if (BuildVectorSDNode *BV =
12076 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
12077 // Bail out if the vector isn't a constant.
12078 if (!BV->isConstant())
12079 return SDValue();
12080
12081 // Everything checks out. Build up the new and improved node.
12082 SDLoc DL(N);
12083 EVT IntVT = BV->getValueType(0);
12084 // Create a new constant of the appropriate type for the transformed
12085 // DAG.
// Apply the unary op to the constant at compile time; lanes where the
// compare produced 0 stay 0, lanes of -1 select UNARYOP(constant).
12086 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
12087 // The AND node needs bitcasts to/from an integer vector type around it.
12090 N->getOperand(0)->getOperand(0), MaskConst);
12091 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
12092 return Res;
12093 }
12094
12095 return SDValue();
12096}
12097
// [SU]INT_TO_FP combine: first tries the vector compare-and-mask fold above,
// then replaces an integer load feeding only this conversion with an FP load
// plus an AdvSIMD scalar [SU]CVTF, avoiding an int-to-vector move.
// NOTE(review): doxygen scrape — original lines 12098 (first signature line),
// 12102 (the `if` that defines Res via
// performVectorCompareAndMaskUnaryOpCombine), 12120 (the LN0 LoadSDNode cast
// used at 12121) and 12130 (the SCVTF/UCVTF opcode selection) were dropped
// by the extractor.
12099 const AArch64Subtarget *Subtarget) {
12100 // First try to optimize away the conversion when it's conditionally from
12101 // a constant. Vectors only.
12103 return Res;
12104
12105 EVT VT = N->getValueType(0);
12106 if (VT != MVT::f32 && VT != MVT::f64)
12107 return SDValue();
12108
12109 // Only optimize when the source and destination types have the same width.
12110 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
12111 return SDValue();
12112
12113 // If the result of an integer load is only used by an integer-to-float
12114 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
12115 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
12116 SDValue N0 = N->getOperand(0);
12117 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
12118 // Do not change the width of a volatile load.
12119 !cast<LoadSDNode>(N0)->isVolatile()) {
12121 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
12122 LN0->getPointerInfo(), LN0->getAlignment(),
12123 LN0->getMemOperand()->getFlags());
12124
12125 // Make sure successors of the original load stay after it by updating them
12126 // to use the new Chain.
12127 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
12128
12129 unsigned Opcode =
12131 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
12132 }
12133
12134 return SDValue();
12135}
12136
// NOTE(review): doxygen scrape — original lines 12139-12140 (signature),
// 12154 (the constant-splat check on ConstVec), 12171-12172 (the
// BuildVectorSDNode cast and UndefElements used at 12174), 12184/12187 (the
// ResTy assignments for the 2- and 4-lane cases) and 12201-12203 (the start
// of the FixConv INTRINSIC_WO_CHAIN node used at 12206-12207) were dropped
// by the extractor.
12137/// Fold a floating-point multiply by power of two into floating-point to
12138/// fixed-point conversion.
12141 const AArch64Subtarget *Subtarget) {
12142 if (!Subtarget->hasNEON())
12143 return SDValue();
12144
12145 if (!N->getValueType(0).isSimple())
12146 return SDValue();
12147
12148 SDValue Op = N->getOperand(0);
12149 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12150 Op.getOpcode() != ISD::FMUL)
12151 return SDValue();
12152
12153 SDValue ConstVec = Op->getOperand(1);
12155 return SDValue();
12156
12157 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
12158 uint32_t FloatBits = FloatTy.getSizeInBits();
12159 if (FloatBits != 32 && FloatBits != 64)
12160 return SDValue();
12161
12162 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
12163 uint32_t IntBits = IntTy.getSizeInBits();
12164 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12165 return SDValue();
12166
12167 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
12168 if (IntBits > FloatBits)
12169 return SDValue();
12170
// C is log2 of the power-of-two multiplier; it becomes the fixed-point
// fractional-bits immediate of the FCVTZ[SU] (vcvtfp2fx[su]) intrinsic.
12173 int32_t Bits = IntBits == 64 ? 64 : 32;
12174 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
12175 if (C == -1 || C == 0 || C > Bits)
12176 return SDValue();
12177
12178 MVT ResTy;
12179 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12180 switch (NumLanes) {
12181 default:
12182 return SDValue();
12183 case 2:
12185 break;
12186 case 4:
12188 break;
12189 }
12190
12191 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12192 return SDValue();
12193
12194 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
12195 "Illegal vector type after legalization");
12196
12197 SDLoc DL(N);
12198 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12199 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
12200 : Intrinsic::aarch64_neon_vcvtfp2fxu;
12204 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
12205 // We can handle smaller integers by generating an extra trunc.
12206 if (IntBits < FloatBits)
12207 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
12208
12209 return FixConv;
12210}
12211
// NOTE(review): doxygen scrape — original lines 12214-12215 (signature),
// 12228 (the constant-splat check on ConstVec), 12245-12246 (the
// BuildVectorSDNode cast and UndefElements used at 12247), 12257/12260 (the
// ResTy assignments), 12271 (the extend of ConvInput to ResTy) and 12277
// (the intrinsic-id + ConvInput operand line of the final node) were dropped
// by the extractor.
12212/// Fold a floating-point divide by power of two into fixed-point to
12213/// floating-point conversion.
12216 const AArch64Subtarget *Subtarget) {
12217 if (!Subtarget->hasNEON())
12218 return SDValue();
12219
12220 SDValue Op = N->getOperand(0);
12221 unsigned Opc = Op->getOpcode();
12222 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12223 !Op.getOperand(0).getValueType().isSimple() ||
12224 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
12225 return SDValue();
12226
12227 SDValue ConstVec = N->getOperand(1);
12229 return SDValue();
12230
12231 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12232 int32_t IntBits = IntTy.getSizeInBits();
12233 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12234 return SDValue();
12235
12236 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12237 int32_t FloatBits = FloatTy.getSizeInBits();
12238 if (FloatBits != 32 && FloatBits != 64)
12239 return SDValue();
12240
12241 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
12242 if (IntBits > FloatBits)
12243 return SDValue();
12244
// C is log2 of the power-of-two divisor; it becomes the fractional-bits
// immediate of the [us]cvtf (vcvtfx[su]2fp) fixed-point conversion.
12247 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
12248 if (C == -1 || C == 0 || C > FloatBits)
12249 return SDValue();
12250
12251 MVT ResTy;
12252 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12253 switch (NumLanes) {
12254 default:
12255 return SDValue();
12256 case 2:
12258 break;
12259 case 4:
12261 break;
12262 }
12263
12264 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12265 return SDValue();
12266
12267 SDLoc DL(N);
12268 SDValue ConvInput = Op.getOperand(0);
12269 bool IsSigned = Opc == ISD::SINT_TO_FP;
// Narrow inputs must first be widened so the fixed-point convert sees the
// full element width.
12270 if (IntBits < FloatBits)
12272 ResTy, ConvInput);
12273
12274 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
12275 : Intrinsic::aarch64_neon_vcvtfxu2fp;
12276 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
12278 DAG.getConstant(C, DL, MVT::i32));
12279}
12280
12281/// An EXTR instruction is made up of two shifts, ORed together. This helper
12282/// searches for and classifies those shifts.
12283static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
12284 bool &FromHi) {
12285 if (N.getOpcode() == ISD::SHL)
12286 FromHi = false;
12287 else if (N.getOpcode() == ISD::SRL)
12288 FromHi = true;
12289 else
12290 return false;
12291
12292 if (!isa<ConstantSDNode>(N.getOperand(1)))
12293 return false;
12294
12295 ShiftAmount = N->getConstantOperandVal(1);
12296 Src = N->getOperand(0);
12297 return true;
12298}
12299
// NOTE(review): doxygen scrape — original lines 12305-12306 (signature),
// 12338 (presumably the matching swap of the shift amounts after the
// LHS/RHS swap — confirm against the original) and 12342 (the shift-amount
// operand of the EXTR node) were dropped by the extractor.
12300/// EXTR instruction extracts a contiguous chunk of bits from two existing
12301/// registers viewed as a high/low pair. This function looks for the pattern:
12302/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
12303/// with an EXTR. Can't quite be done in TableGen because the two immediates
12304/// aren't independent.
12307 SelectionDAG &DAG = DCI.DAG;
12308 SDLoc DL(N);
12309 EVT VT = N->getValueType(0);
12310
12311 assert(N->getOpcode() == ISD::OR && "Unexpected root");
12312
12313 if (VT != MVT::i32 && VT != MVT::i64)
12314 return SDValue();
12315
12316 SDValue LHS;
12317 uint32_t ShiftLHS = 0;
12318 bool LHSFromHi = false;
12319 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
12320 return SDValue();
12321
12322 SDValue RHS;
12323 uint32_t ShiftRHS = 0;
12324 bool RHSFromHi = false;
12325 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
12326 return SDValue();
12327
12328 // If they're both trying to come from the high part of the register, they're
12329 // not really an EXTR.
12330 if (LHSFromHi == RHSFromHi)
12331 return SDValue();
12332
// The two shift amounts must tile the register exactly (N and Width-N).
12333 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
12334 return SDValue();
12335
// Canonicalize so the SHL half is first, matching EXTR's operand order.
12336 if (LHSFromHi) {
12337 std::swap(LHS, RHS);
12339 }
12340
12341 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
12343}
12344
// Tries to rewrite (or (and X, CMask), (and Y, ~CMask)) with constant,
// bitwise-complementary build-vector masks into an AArch64ISD::BSP (bitwise
// select) node.
// NOTE(review): doxygen scrape — original lines 12345-12346 (signature),
// 12368-12369 (the BVN0/BVN1 BuildVectorSDNode casts tested at 12370) and
// 12375-12376 (the per-element CN0/CN1 ConstantSDNode casts tested at 12377)
// were dropped by the extractor.
12347 EVT VT = N->getValueType(0);
12348 SelectionDAG &DAG = DCI.DAG;
12349 SDLoc DL(N);
12350
12351 if (!VT.isVector())
12352 return SDValue();
12353
12354 SDValue N0 = N->getOperand(0);
12355 if (N0.getOpcode() != ISD::AND)
12356 return SDValue();
12357
12358 SDValue N1 = N->getOperand(1);
12359 if (N1.getOpcode() != ISD::AND)
12360 return SDValue();
12361
12362 // We only have to look for constant vectors here since the general, variable
12363 // case can be handled in TableGen.
12364 unsigned Bits = VT.getScalarSizeInBits();
12365 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
// Try each AND operand as the candidate constant mask (i/j pick which
// operand of N0/N1 is treated as the mask).
12366 for (int i = 1; i >= 0; --i)
12367 for (int j = 1; j >= 0; --j) {
12370 if (!BVN0 || !BVN1)
12371 continue;
12372
// Masks must be exact element-wise complements (within the element width).
12373 bool FoundMatch = true;
12374 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
12377 if (!CN0 || !CN1 ||
12378 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
12379 FoundMatch = false;
12380 break;
12381 }
12382 }
12383
12384 if (FoundMatch)
12385 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
12386 N0->getOperand(1 - i), N1->getOperand(1 - j));
12387 }
12388
12389 return SDValue();
12390}
12391
// DAG-combine entry point for OR nodes on legal types: tries the EXTR
// pattern first, then the BSL (bitwise select) pattern.
// NOTE(review): doxygen scrape — original line 12392 (first signature line)
// was dropped by the extractor.
12393 const AArch64Subtarget *Subtarget) {
12394 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
12395 SelectionDAG &DAG = DCI.DAG;
12396 EVT VT = N->getValueType(0);
12397
12398 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12399 return SDValue();
12400
12401 if (SDValue Res = tryCombineToEXTR(N, DCI))
12402 return Res;
12403
12404 if (SDValue Res = tryCombineToBSL(N, DCI))
12405 return Res;
12406
12407 return SDValue();
12408}
12409
// Returns true when N is a DUP/SPLAT_VECTOR of the constant all-ones mask for
// MemVT's element width (0xff/0xffff/0xffffffff) — i.e. an AND with it would
// be a no-op after an implicit zero-extending load of MemVT.
// NOTE(review): doxygen scrape — original line 12410 (the signature, taking
// SDNode *N and EVT MemVT per the uses below) was dropped by the extractor.
12411 if (!MemVT.getVectorElementType().isSimple())
12412 return false;
12413
12414 uint64_t MaskForTy = 0ull;
12415 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
12416 case MVT::i8:
12417 MaskForTy = 0xffull;
12418 break;
12419 case MVT::i16:
12420 MaskForTy = 0xffffull;
12421 break;
12422 case MVT::i32:
12423 MaskForTy = 0xffffffffull;
12424 break;
12425 default:
12426 return false;
12427 break;
12428 }
12429
12430 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
12431 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
12432 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
12433
12434 return false;
12435}
12436
// AND combine for scalable vectors: pushes a mask through an unsigned unpack
// (or removes it when the unpack already zero-extends past the mask), and
// removes redundant masks after implicitly zero-extending SVE loads.
// NOTE(review): doxygen scrape — original lines 12437-12438 (signature),
// 12471 (the rebuild of Dup with the truncated mask used by the AND at
// 12475-12476), 12481 (a guard condition before the early return at 12482)
// and 12494-12496 plus 12499-12513 (the long runs of `case AArch64ISD::...`
// labels naming the zero-extending SVE load opcodes) were dropped by the
// extractor.
12439 if (DCI.isBeforeLegalizeOps())
12440 return SDValue();
12441
12442 SelectionDAG &DAG = DCI.DAG;
12443 SDValue Src = N->getOperand(0);
12444 unsigned Opc = Src->getOpcode();
12445
12446 // Zero/any extend of an unsigned unpack
12447 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
12448 SDValue UnpkOp = Src->getOperand(0);
12449 SDValue Dup = N->getOperand(1);
12450
12451 if (Dup.getOpcode() != AArch64ISD::DUP)
12452 return SDValue();
12453
12454 SDLoc DL(N);
12455 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
12456 uint64_t ExtVal = C->getZExtValue();
12457
12458 // If the mask is fully covered by the unpack, we don't need to push
12459 // a new AND onto the operand
12460 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
12461 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
12462 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
12463 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
12464 return Src;
12465
12466 // Truncate to prevent a DUP with an over wide constant
12467 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
12468
12469 // Otherwise, make sure we propagate the AND to the operand
12470 // of the unpack
12472 UnpkOp->getValueType(0),
12473 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
12474
12475 SDValue And = DAG.getNode(ISD::AND, DL,
12476 UnpkOp->getValueType(0), UnpkOp, Dup);
12477
12478 return DAG.getNode(Opc, DL, N->getValueType(0), And);
12479 }
12480
12482 return SDValue();
12483
12484 SDValue Mask = N->getOperand(1);
12485
12486 if (!Src.hasOneUse())
12487 return SDValue();
12488
12489 EVT MemVT;
12490
12491 // SVE load instructions perform an implicit zero-extend, which makes them
12492 // perfect candidates for combining.
12493 switch (Opc) {
12497 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
12498 break;
12514 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
12515 break;
12516 default:
12517 return SDValue();
12518 }
12519
// The AND is redundant when its mask matches the memory element width the
// load already zero-extended from.
12520 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
12521 return Src;
12522
12523 return SDValue();
12524}
12525
// AND combine: dispatches scalable vectors to performSVEAndCombine; for NEON
// vectors, tries to rewrite (and x, C) as a BIC with an inverted immediate
// when C's complement is encodable as a MOVI/MVNI-style immediate.
// NOTE(review): doxygen scrape — original lines 12526-12527 (signature),
// 12542 (the BVN BuildVectorSDNode cast completed on 12543-12544), 12553
// (the condition that populates DefBits/UndefBits from BVN) and
// 12557/12559/12563-12564/12566 (the tryAdvSIMDModImm* call lines) were
// dropped by the extractor.
12528 SelectionDAG &DAG = DCI.DAG;
12529 SDValue LHS = N->getOperand(0);
12530 EVT VT = N->getValueType(0);
12531 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
12532 return SDValue();
12533
12534 if (VT.isScalableVector())
12535 return performSVEAndCombine(N, DCI);
12536
12537 // The combining code below works only for NEON vectors. In particular, it
12538 // does not work for SVE when dealing with vectors wider than 128 bits.
12539 if (!(VT.is64BitVector() || VT.is128BitVector()))
12540 return SDValue();
12541
12543 dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
12544 if (!BVN)
12545 return SDValue();
12546
12547 // AND does not accept an immediate, so check if we can use a BIC immediate
12548 // instruction instead. We do this here instead of using a (and x, (mvni imm))
12549 // pattern in isel, because some immediates may be lowered to the preferred
12550 // (and x, (movi imm)) form, even though an mvni representation also exists.
12551 APInt DefBits(VT.getSizeInBits(), 0);
12552 APInt UndefBits(VT.getSizeInBits(), 0);
12554 SDValue NewOp;
12555
// BIC clears the bits set in its immediate, so try to encode ~C.
12556 DefBits = ~DefBits;
12558 DefBits, &LHS)) ||
12560 DefBits, &LHS)))
12561 return NewOp;
12562
12565 UndefBits, &LHS)) ||
12567 UndefBits, &LHS)))
12568 return NewOp;
12569 }
12570
12571 return SDValue();
12572}
12573
// SRL combine: canonicalizes (srl (bswap x), 16|32) to (rotr (bswap x),
// 16|32) when the shifted-out high bits of x are known zero, exposing REV16/
// REV32-style patterns.
// NOTE(review): doxygen scrape — original lines 12574-12575 (signature),
// 12589 (the ConstantSDNode cast defining C from N1) and 12592/12595 (the
// known-zero checks on N00's high 16/32 bits) were dropped by the extractor.
12576 SelectionDAG &DAG = DCI.DAG;
12577 EVT VT = N->getValueType(0);
12578 if (VT != MVT::i32 && VT != MVT::i64)
12579 return SDValue();
12580
12581 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
12582 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
12583 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
12584 SDValue N0 = N->getOperand(0);
12585 if (N0.getOpcode() == ISD::BSWAP) {
12586 SDLoc DL(N);
12587 SDValue N1 = N->getOperand(1);
12588 SDValue N00 = N0.getOperand(0);
12590 uint64_t ShiftAmt = C->getZExtValue();
12591 if (VT == MVT::i32 && ShiftAmt == 16 &&
12593 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
12594 if (VT == MVT::i64 && ShiftAmt == 32 &&
12596 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
12597 }
12598 }
12599 return SDValue();
12600}
12601
// NOTE(review): doxygen scrape — original lines 12614 (the line carrying the
// function name; presumably performVectorTruncateCombine — confirm against
// the original file), 12632 (the ExtendOpA/ExtendOpB declarations assigned
// below), 12647-12648 (the check that C is the all-ones value for
// ElemSizeInBits), 12662 (the zext/sext opcode admissibility check), 12675
// (the IsSignExtend flag) and 12678-12679 (the [us][r]hadd opcode selection
// arms) were dropped by the extractor.
12602// Attempt to form urhadd(OpA, OpB) from
12603// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
12604// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
12605// The original form of the first expression is
12606// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
12607// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
12608// Before this function is called the srl will have been lowered to
12609// AArch64ISD::VLSHR.
12610// This pass can also recognize signed variants of the patterns that use sign
12611// extension instead of zero extension and form a srhadd(OpA, OpB) or a
12612// shadd(OpA, OpB) from them.
12613static SDValue
12615 SelectionDAG &DAG) {
12616 EVT VT = N->getValueType(0);
12617
12618 // Since we are looking for a right shift by a constant value of 1 and we are
12619 // operating on types at least 16 bits in length (sign/zero extended OpA and
12620 // OpB, which are at least 8 bits), it follows that the truncate will always
12621 // discard the shifted-in bit and therefore the right shift will be logical
12622 // regardless of the signedness of OpA and OpB.
12623 SDValue Shift = N->getOperand(0);
12624 if (Shift.getOpcode() != AArch64ISD::VLSHR)
12625 return SDValue();
12626
12627 // Is the right shift using an immediate value of 1?
12628 uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
12629 if (ShiftAmount != 1)
12630 return SDValue();
12631
// SUB feeds the rounding variant (OpB - ~OpA == OpA + OpB + 1); ADD feeds
// the plain halving-add variant.
12633 SDValue ShiftOp0 = Shift.getOperand(0);
12634 unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
12635 if (ShiftOp0Opc == ISD::SUB) {
12636
12637 SDValue Xor = ShiftOp0.getOperand(1);
12638 if (Xor.getOpcode() != ISD::XOR)
12639 return SDValue();
12640
12641 // Is the XOR using a constant amount of all ones in the right hand side?
12642 uint64_t C;
12643 if (!isAllConstantBuildVector(Xor.getOperand(1), C))
12644 return SDValue();
12645
12646 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
12649 return SDValue();
12650
12651 ExtendOpA = Xor.getOperand(0);
12652 ExtendOpB = ShiftOp0.getOperand(0);
12653 } else if (ShiftOp0Opc == ISD::ADD) {
12654 ExtendOpA = ShiftOp0.getOperand(0);
12655 ExtendOpB = ShiftOp0.getOperand(1);
12656 } else
12657 return SDValue();
12658
// Both operands must be the same kind of extend so one signedness applies.
12659 unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
12660 unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
12661 if (!(ExtendOpAOpc == ExtendOpBOpc &&
12663 return SDValue();
12664
12665 // Is the result of the right shift being truncated to the same value type as
12666 // the original operands, OpA and OpB?
12667 SDValue OpA = ExtendOpA.getOperand(0);
12668 SDValue OpB = ExtendOpB.getOperand(0);
12669 EVT OpAVT = OpA.getValueType();
12670 assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
12671 if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
12672 return SDValue();
12673
12674 SDLoc DL(N);
12676 bool IsRHADD = ShiftOp0Opc == ISD::SUB;
12677 unsigned HADDOpc = IsSignExtend
12680 SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
12681
12682 return ResultHADD;
12683}
12684
12685static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
12686 switch (Opcode) {
12687 case ISD::FADD:
12688 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
12689 case ISD::ADD:
12690 return VT == MVT::i64;
12691 default:
12692 return false;
12693 }
12694}
12695
// Rewrites extract(lane 0) of a pairwise-shaped fadd/add — (op V,
// vector_shuffle(V, undef, <1,...>)) — into a scalar op of lanes 0 and 1,
// matching FADDP/ADDP.
// NOTE(review): doxygen scrape — the original signature line(s) before 12697,
// plus 12698 (the ConstantN1 cast of N1 used at 12711), 12717/12722 (the
// Shuffle dyn_casts for the direct and commuted operand orders) and
// 12729/12731 (the EXTRACT_VECTOR_ELT node lines feeding the constants at
// 12730/12732) were dropped by the extractor.
12697 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
12699
12700 EVT VT = N->getValueType(0);
12701 const bool FullFP16 =
12702 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
12703
12704 // Rewrite for pairwise fadd pattern
12705 // (f32 (extract_vector_elt
12706 // (fadd (vXf32 Other)
12707 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
12708 // ->
12709 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
12710 // (extract_vector_elt (vXf32 Other) 1))
12711 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
12712 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
12713 SDLoc DL(N0);
12714 SDValue N00 = N0->getOperand(0);
12715 SDValue N01 = N0->getOperand(1);
12716
12718 SDValue Other = N00;
12719
12720 // And handle the commutative case.
12721 if (!Shuffle) {
12723 Other = N01;
12724 }
12725
// Mask element 0 == 1 means the shuffle contributes lane 1 of Other, so
// the add's lane 0 is Other[0] + Other[1].
12726 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
12727 Other == Shuffle->getOperand(0)) {
12728 return DAG.getNode(N0->getOpcode(), DL, VT,
12730 DAG.getConstant(0, DL, MVT::i64)),
12732 DAG.getConstant(1, DL, MVT::i64)));
12733 }
12734 }
12735
12736 return SDValue();
12737}
12738
12741 SelectionDAG &DAG) {
 // Combine on CONCAT_VECTORS. Four independent rewrites are attempted in
 // order: (1) concat-of-truncates via a shuffle of bitcasts, (2) merging
 // concat of [us](r)hadd halves, (3) canonicalising a self-concat of a
 // two-element vector to DUPLANE64, (4) pushing a BITCAST on the RHS out of
 // the concat. NOTE(review): the signature line is not visible here.
12742 SDLoc dl(N);
12743 EVT VT = N->getValueType(0);
12744 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
12745 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
12746
12747 // Optimize concat_vectors of truncated vectors, where the intermediate
12748 // type is illegal, to avoid said illegality, e.g.,
12749 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
12750 // (v2i16 (truncate (v2i64)))))
12751 // ->
12752 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
12753 // (v4i32 (bitcast (v2i64))),
12754 // <0, 2, 4, 6>)))
12755 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
12756 // on both input and result type, so we might generate worse code.
12757 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
12758 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
12759 N1Opc == ISD::TRUNCATE) {
12760 SDValue N00 = N0->getOperand(0);
12761 SDValue N10 = N1->getOperand(0);
12762 EVT N00VT = N00.getValueType();
12763
12764 if (N00VT == N10.getValueType() &&
12765 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
12766 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
 // Mask <0, 2, 4, ...> selects the even (low-half) elements of the
 // concatenated bitcast inputs.
12768 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
12769 for (size_t i = 0; i < Mask.size(); ++i)
12770 Mask[i] = i * 2;
12771 return DAG.getNode(ISD::TRUNCATE, dl, VT,
12772 DAG.getVectorShuffle(
12773 MidVT, dl,
12774 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
12775 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
12776 }
12777 }
12778
12779 // Wait 'til after everything is legalized to try this. That way we have
12780 // legal vector types and such.
12781 if (DCI.isBeforeLegalizeOps())
12782 return SDValue();
12783
12784 // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
12785 // subvectors from the same original vectors. Combine these into a single
12786 // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
12787 // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
12788 // extract_subvector (v16i8 OpB,
12789 // <0>))),
12790 // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
12791 // extract_subvector (v16i8 OpB,
12792 // <8>)))))
12793 // ->
12794 // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
12795 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
12798 SDValue N00 = N0->getOperand(0);
12799 SDValue N01 = N0->getOperand(1);
12800 SDValue N10 = N1->getOperand(0);
12801 SDValue N11 = N1->getOperand(1);
12802
12803 EVT N00VT = N00.getValueType();
12804 EVT N10VT = N10.getValueType();
12805
12806 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12807 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12808 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12809 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
12810 SDValue N00Source = N00->getOperand(0);
12811 SDValue N01Source = N01->getOperand(0);
12812 SDValue N10Source = N10->getOperand(0);
12813 SDValue N11Source = N11->getOperand(0);
12814
12815 if (N00Source == N10Source && N01Source == N11Source &&
12816 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
12817 assert(N0.getValueType() == N1.getValueType());
12818
12819 uint64_t N00Index = N00.getConstantOperandVal(1);
12820 uint64_t N01Index = N01.getConstantOperandVal(1);
12821 uint64_t N10Index = N10.getConstantOperandVal(1);
12822 uint64_t N11Index = N11.getConstantOperandVal(1);
12823
 // N0 must take the low halves and N1 the high halves of both sources.
12824 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
12825 N10Index == N00VT.getVectorNumElements())
12826 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
12827 }
12828 }
12829 }
12830
12831 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
12832 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
12833 // canonicalise to that.
12834 if (N0 == N1 && VT.getVectorNumElements() == 2) {
12835 assert(VT.getScalarSizeInBits() == 64);
12836 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
12837 DAG.getConstant(0, dl, MVT::i64));
12838 }
12839
12840 // Canonicalise concat_vectors so that the right-hand vector has as few
12841 // bit-casts as possible before its real operation. The primary matching
12842 // destination for these operations will be the narrowing "2" instructions,
12843 // which depend on the operation being performed on this right-hand vector.
12844 // For example,
12845 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
12846 // becomes
12847 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
12848
12849 if (N1Opc != ISD::BITCAST)
12850 return SDValue();
12851 SDValue RHS = N1->getOperand(0);
12852 MVT RHSTy = RHS.getValueType().getSimpleVT();
12853 // If the RHS is not a vector, this is not the pattern we're looking for.
12854 if (!RHSTy.isVector())
12855 return SDValue();
12856
12857 LLVM_DEBUG(
12858 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
12859
 // Concat in the RHS's (pre-bitcast) type, then bitcast the whole result
 // back to the original VT.
12860 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
12861 RHSTy.getVectorNumElements() * 2);
12862 return DAG.getNode(ISD::BITCAST, dl, VT,
12864 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
12865 RHS));
12866}
12867
12870 SelectionDAG &DAG) {
 // Moves a scalar fixed-point convert intrinsic on an extracted lane to a
 // vector convert followed by a lane extract. NOTE(review): the signature
 // line is not visible in this chunk.
12871 // Wait until after everything is legalized to try this. That way we have
12872 // legal vector types and such.
12873 if (DCI.isBeforeLegalizeOps())
12874 return SDValue();
12875 // Transform a scalar conversion of a value from a lane extract into a
12876 // lane extract of a vector conversion. E.g., from foo1 to foo2:
12877 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
12878 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
12879 //
12880 // The second form interacts better with instruction selection and the
12881 // register allocator to avoid cross-class register copies that aren't
12882 // coalescable due to a lane reference.
12883
12884 // Check the operand and see if it originates from a lane extract.
12885 SDValue Op1 = N->getOperand(1);
12886 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12887 // Yep, no additional predication needed. Perform the transform.
12888 SDValue IID = N->getOperand(0);
12889 SDValue Shift = N->getOperand(2);
12890 SDValue Vec = Op1.getOperand(0);
12891 SDValue Lane = Op1.getOperand(1);
12892 EVT ResTy = N->getValueType(0);
12893 EVT VecResTy;
12894 SDLoc DL(N);
12895
12896 // The vector width should be 128 bits by the time we get here, even
12897 // if it started as 64 bits (the extract_vector handling will have
12898 // done so).
12899 assert(Vec.getValueSizeInBits() == 128 &&
12900 "unexpected vector size on extract_vector_elt!");
12901 if (Vec.getValueType() == MVT::v4i32)
12903 else if (Vec.getValueType() == MVT::v2i64)
12905 else
12906 llvm_unreachable("unexpected vector type!");
12907
 // Re-issue the intrinsic on the whole vector and extract the wanted lane.
12908 SDValue Convert =
12909 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
12910 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
12911 }
12912 return SDValue();
12913}
12914
12915// AArch64 high-vector "long" operations are formed by performing the non-high
12916// version on an extract_subvector of each operand which gets the high half:
12917//
12918// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
12919//
12920// However, there are cases which don't have an extract_high explicitly, but
12921// have another operation that can be made compatible with one for free. For
12922// example:
12923//
12924// (dupv64 scalar) --> (extract_high (dup128 scalar))
12925//
12926// This routine does the actual conversion of such DUPs, once outer routines
12927// have determined that everything else is in order.
12928// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
12929// similarly here.
 // Converts an eligible DUP/immediate-DUP node N into
 // (extract_subvector (dup-at-double-width), NumElems), i.e. an explicit
 // extract_high, so long-operation patterns can match. NOTE(review): the
 // signature line and several case labels are not visible in this chunk.
12931 switch (N.getOpcode()) {
12932 case AArch64ISD::DUP:
12937 case AArch64ISD::MOVI:
12943 break;
12944 default:
12945 // FMOV could be supported, but isn't very useful, as it would only occur
12946 // if you passed a bitcast' floating point immediate to an eligible long
12947 // integer op (addl, smull, ...).
12948 return SDValue();
12949 }
12950
 // Only 64-bit vectors can be widened to the 128-bit "high" form.
12951 MVT NarrowTy = N.getSimpleValueType();
12952 if (!NarrowTy.is64BitVector())
12953 return SDValue();
12954
12955 MVT ElementTy = NarrowTy.getVectorElementType();
12956 unsigned NumElems = NarrowTy.getVectorNumElements();
12957 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
12958
 // Rebuild the DUP at double width and take its high half (index NumElems).
12959 SDLoc dl(N);
12960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
12961 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
12962 DAG.getConstant(NumElems, dl, MVT::i64));
12963}
12964
 // Returns true if N (looking through one BITCAST) is an EXTRACT_SUBVECTOR
 // taking the upper half of its source vector. NOTE(review): the signature
 // line is not visible in this chunk.
12966 if (N.getOpcode() == ISD::BITCAST)
12967 N = N.getOperand(0);
12968 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
12969 return false;
 // A "high" extract starts at exactly half the source's element count.
12970 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
12971 N.getOperand(0).getValueType().getVectorNumElements() / 2;
12972}
12973
12974/// Helper structure to keep track of ISD::SET_CC operands.
12980
12981/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
12986
12987/// Helper structure to keep track of SetCC information.
12992
12993/// Helper structure to be able to read SetCC information. If set to
12994/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
12995/// GenericSetCCInfo.
13000
13001/// Check whether or not \p Op is a SET_CC operation, either a generic or
13002/// an
13003/// AArch64 lowered one.
13004/// \p SetCCInfo is filled accordingly.
13005 /// \post SetCCInfo is meaningful only when this function returns true.
13006/// \return True when Op is a kind of SET_CC operation.
 // Recognises both a generic ISD::SETCC and an AArch64 CSEL-of-0/1 as a
 // set-cc, filling SetCCInfo accordingly. NOTE(review): the signature line
 // and the operand-swap lines in the !TValue->isOne() path are not visible
 // in this chunk.
13008 // If this is a setcc, this is straight forward.
13009 if (Op.getOpcode() == ISD::SETCC) {
13010 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13011 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13012 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13013 SetCCInfo.IsAArch64 = false;
13014 return true;
13015 }
13016 // Otherwise, check if this is a matching csel instruction.
13017 // In other words:
13018 // - csel 1, 0, cc
13019 // - csel 0, 1, !cc
13020 if (Op.getOpcode() != AArch64ISD::CSEL)
13021 return false;
13022 // Set the information about the operands.
13023 // TODO: we want the operands of the Cmp not the csel
13024 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13025 SetCCInfo.IsAArch64 = true;
13026 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13027 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13028
13029 // Check that the operands matches the constraints:
13030 // (1) Both operands must be constants.
13031 // (2) One must be 1 and the other must be 0.
13032 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13033 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13034
13035 // Check (1).
13036 if (!TValue || !FValue)
13037 return false;
13038
13039 // Check (2).
13040 if (!TValue->isOne()) {
13041 // Update the comparison when we are interested in !cc.
13043 SetCCInfo.Info.AArch64.CC =
13045 }
13046 return TValue->isOne() && FValue->isNullValue();
13047}
13048
13049// Returns true if Op is setcc or zext of setcc.
13050static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13051 if (isSetCC(Op, Info))
13052 return true;
13053 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13054 isSetCC(Op->getOperand(0), Info));
13055}
13056
13057// The folding we want to perform is:
13058// (add x, [zext] (setcc cc ...) )
13059// -->
13060// (csel x, (add x, 1), !cc ...)
13061//
13062// The latter will get matched to a CSINC instruction.
 // Folds (add x, [zext] (setcc ...)) into a CSEL that isel matches as CSINC.
 // NOTE(review): the signature line, the InfoAndKind declaration, and the
 // second isSetCCOrZExtSetCC check after the swap are not visible in this
 // chunk.
13064 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13065 SDValue LHS = Op->getOperand(0);
13066 SDValue RHS = Op->getOperand(1);
13068
13069 // If neither operand is a SET_CC, give up.
13070 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13071 std::swap(LHS, RHS);
13073 return SDValue();
13074 }
13075
13076 // FIXME: This could be generalized to work for FP comparisons.
13077 EVT CmpVT = InfoAndKind.IsAArch64
13078 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13079 : InfoAndKind.Info.Generic.Opnd0->getValueType();
13080 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13081 return SDValue();
13082
13083 SDValue CCVal;
13084 SDValue Cmp;
13085 SDLoc dl(Op);
 // For an already-lowered AArch64 set-cc, reuse its compare and invert the
 // condition; for a generic setcc, build the compare with the inverted CC.
13086 if (InfoAndKind.IsAArch64) {
13087 CCVal = DAG.getConstant(
13088 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13089 MVT::i32);
13090 Cmp = *InfoAndKind.Info.AArch64.Cmp;
13091 } else
13092 Cmp = getAArch64Cmp(
13093 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13094 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13095 dl);
13096
 // Emit (csel x, (add x, 1), !cc): after the earlier swap, RHS is the
 // non-setcc addend x.
13097 EVT VT = Op->getValueType(0);
13098 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13099 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13100}
13101
13102// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b): merge two across-vector adds
 // whose lane-0 extracts are summed into a single reduction of the vector
 // sum. NOTE(review): the signature line is not visible in this chunk.
13104 EVT VT = N->getValueType(0);
13105 // Only scalar integer and vector types.
13106 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
13107 return SDValue();
13108
13109 SDValue LHS = N->getOperand(0);
13110 SDValue RHS = N->getOperand(1);
13111 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13112 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
13113 return SDValue();
13114
 // Both extracts must read lane 0 (where UADDV places its result).
13115 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13116 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
13117 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
13118 return SDValue();
13119
13120 SDValue Op1 = LHS->getOperand(0);
13121 SDValue Op2 = RHS->getOperand(0);
13122 EVT OpVT1 = Op1.getValueType();
13123 EVT OpVT2 = Op2.getValueType();
13124 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
13125 Op2.getOpcode() != AArch64ISD::UADDV ||
13126 OpVT1.getVectorElementType() != VT)
13127 return SDValue();
13128
 // Add the two source vectors once, then reduce the sum.
13129 SDValue Val1 = Op1.getOperand(0);
13130 SDValue Val2 = Op2.getOperand(0);
13131 EVT ValVT = Val1->getValueType(0);
13132 SDLoc DL(N);
13133 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
13134 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13135 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
13136 DAG.getConstant(0, DL, MVT::i64));
13137}
13138
13139// The basic add/sub long vector instructions have variants with "2" on the end
13140// which act on the high-half of their inputs. They are normally matched by
13141// patterns like:
13142//
13143// (add (zeroext (extract_high LHS)),
13144// (zeroext (extract_high RHS)))
13145// -> uaddl2 vD, vN, vM
13146//
13147// However, if one of the extracts is something like a duplicate, this
13148// instruction can still be used profitably. This function puts the DAG into a
13149// more appropriate form for those patterns to trigger.
13152 SelectionDAG &DAG) {
 // Massages (add/sub (ext LHS), (ext RHS)) so that when one side is an
 // extract_high, a DUP on the other side is rewritten to an extract_high
 // too, letting the "2" (high-half) long instructions match. For non-128-bit
 // ADDs, falls back to the setcc-add (CSINC) folding. NOTE(review): the
 // signature lines are not visible in this chunk.
13153 if (DCI.isBeforeLegalizeOps())
13154 return SDValue();
13155
13156 MVT VT = N->getSimpleValueType(0);
13157 if (!VT.is128BitVector()) {
13158 if (N->getOpcode() == ISD::ADD)
13159 return performSetccAddFolding(N, DAG);
13160 return SDValue();
13161 }
13162
13163 // Make sure both branches are extended in the same way.
13164 SDValue LHS = N->getOperand(0);
13165 SDValue RHS = N->getOperand(1);
13166 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
13167 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
13168 LHS.getOpcode() != RHS.getOpcode())
13169 return SDValue();
13170
13171 unsigned ExtType = LHS.getOpcode();
13172
13173 // It's not worth doing if at least one of the inputs isn't already an
13174 // extract, but we don't know which it'll be so we have to try both.
13175 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
13176 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
13177 if (!RHS.getNode())
13178 return SDValue();
13179
 // Re-apply the original extension to the new extract_high.
13180 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
13181 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
13182 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
13183 if (!LHS.getNode())
13184 return SDValue();
13185
13186 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
13187 }
13188
13189 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
13190}
13191
13194 SelectionDAG &DAG) {
 // Entry point for ADD/SUB combines: first try merging two UADDV reductions,
 // then the long-operation/extract_high massaging. NOTE(review): the
 // signature lines are not visible in this chunk.
13195 // Try to change sum of two reductions.
13196 if (SDValue Val = performUADDVCombine(N, DAG))
13197 return Val;
13198
13199 return performAddSubLongCombine(N, DCI, DAG);
13200}
13201
13202// Massage DAGs which we can use the high-half "long" operations on into
13203// something isel will recognize better. E.g.
13204//
13205// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
13206// (aarch64_neon_umull (extract_high (v2i64 vec)))
13207// (extract_high (v2i64 (dup128 scalar)))))
13208//
13211 SelectionDAG &DAG) {
 // When one operand of a long operation is an extract_high and the other is
 // a DUP, rewrite the DUP into an extract_high of a 128-bit DUP so the
 // high-half instruction forms can match. IID == not_intrinsic means N is a
 // plain node (operands 0/1) rather than INTRINSIC_WO_CHAIN (operands 1/2).
 // NOTE(review): the signature lines and the first
 // isEssentiallyExtractHighSubvector(LHS) check are not visible in this
 // chunk.
13212 if (DCI.isBeforeLegalizeOps())
13213 return SDValue();
13214
13215 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
13216 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
13217 assert(LHS.getValueType().is64BitVector() &&
13218 RHS.getValueType().is64BitVector() &&
13219 "unexpected shape for long operation");
13220
13221 // Either node could be a DUP, but it's not worth doing both of them (you'd
13222 // just as well use the non-high version) so look for a corresponding extract
13223 // operation on the other "wing".
13225 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
13226 if (!RHS.getNode())
13227 return SDValue();
13228 } else if (isEssentiallyExtractHighSubvector(RHS)) {
13229 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
13230 if (!LHS.getNode())
13231 return SDValue();
13232 }
13233
13234 if (IID == Intrinsic::not_intrinsic)
13235 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
13236
13237 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
13238 N->getOperand(0), LHS, RHS);
13239}
13240
// Rewrites NEON shift intrinsics with a constant (splat or scalar) shift
// amount into the corresponding immediate-form target nodes. Right-shift
// intrinsics encode the amount as a negative left shift, hence the negation
// below. NOTE(review): two condition lines (the isConstantSplat tail and the
// right-shift range check) are not visible in this chunk.
13241 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
13242 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
13243 unsigned ElemBits = ElemTy.getSizeInBits();
13244
13245 int64_t ShiftAmount;
13246 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
13247 APInt SplatValue, SplatUndef;
13248 unsigned SplatBitSize;
13249 bool HasAnyUndefs;
13250 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
13252 SplatBitSize != ElemBits)
13253 return SDValue();
13254
13255 ShiftAmount = SplatValue.getSExtValue();
13256 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
13257 ShiftAmount = CVN->getSExtValue();
13258 } else
13259 return SDValue();
13260
13261 unsigned Opcode;
13262 bool IsRightShift;
13263 switch (IID) {
13264 default:
13265 llvm_unreachable("Unknown shift intrinsic");
13266 case Intrinsic::aarch64_neon_sqshl:
13267 Opcode = AArch64ISD::SQSHL_I;
13268 IsRightShift = false;
13269 break;
13270 case Intrinsic::aarch64_neon_uqshl:
13271 Opcode = AArch64ISD::UQSHL_I;
13272 IsRightShift = false;
13273 break;
13274 case Intrinsic::aarch64_neon_srshl:
13275 Opcode = AArch64ISD::SRSHR_I;
13276 IsRightShift = true;
13277 break;
13278 case Intrinsic::aarch64_neon_urshl:
13279 Opcode = AArch64ISD::URSHR_I;
13280 IsRightShift = true;
13281 break;
13282 case Intrinsic::aarch64_neon_sqshlu:
13283 Opcode = AArch64ISD::SQSHLU_I;
13284 IsRightShift = false;
13285 break;
13286 case Intrinsic::aarch64_neon_sshl:
13287 case Intrinsic::aarch64_neon_ushl:
13288 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
13289 // left shift for positive shift amounts. Below, we only replace the current
13290 // node with VSHL, if this condition is met.
13291 Opcode = AArch64ISD::VSHL;
13292 IsRightShift = false;
13293 break;
13294 }
13295
13297 SDLoc dl(N);
 // Negate: right-shift intrinsics carry the amount as a negative value.
13298 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
13299 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
13300 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
13301 SDLoc dl(N);
13302 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
13303 DAG.getConstant(ShiftAmount, dl, MVT::i32));
13304 }
13305
13306 return SDValue();
13307}
13308
13309// The CRC32[BH] instructions ignore the high bits of their data operand. Since
13310// the intrinsics must be legal and take an i32, this means there's almost
13311// certainly going to be a zext in the DAG which we can eliminate.
13312 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
 // Drops a redundant (and data, Mask) on the CRC32[BH] data operand, since
 // the instruction ignores the high bits anyway. NOTE(review): the CMask
 // definition and the rebuilt-node line are not visible in this chunk;
 // presumably CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)).
13313 SDValue AndN = N->getOperand(2);
13314 if (AndN.getOpcode() != ISD::AND)
13315 return SDValue();
13316
13318 if (!CMask || CMask->getZExtValue() != Mask)
13319 return SDValue();
13320
13322 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
13323}
13324
13326 SelectionDAG &DAG) {
 // Lowers an across-lanes NEON reduction intrinsic to the target node Opc
 // applied to the whole vector, then extracts lane 0 as the scalar result.
 // NOTE(review): the signature line is not visible in this chunk.
13327 SDLoc dl(N);
13328 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
13329 DAG.getNode(Opc, dl,
13330 N->getOperand(1).getSimpleValueType(),
13331 N->getOperand(1)),
13332 DAG.getConstant(0, dl, MVT::i64));
13333}
13334
 // Lowers aarch64_sve_index to INDEX_VECTOR, widening i8/i16 start/step
 // operands to i32 first. NOTE(review): the signature line is not visible in
 // this chunk.
13336 SDLoc DL(N);
13337 SDValue Op1 = N->getOperand(1);
13338 SDValue Op2 = N->getOperand(2);
13339 EVT ScalarTy = Op1.getValueType();
13340
 // INDEX_VECTOR expects at least i32 scalar operands.
13341 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
13342 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
13343 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
13344 }
13345
13346 return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
13347 Op1, Op2);
13348}
13349
 // Lowers aarch64_sve_dup (predicated splat with passthru) to
 // DUP_MERGE_PASSTHRU, widening an i8/i16 scalar to i32. NOTE(review): the
 // signature line is not visible in this chunk.
13351 SDLoc dl(N);
13352 SDValue Scalar = N->getOperand(3);
13353 EVT ScalarTy = Scalar.getValueType();
13354
13355 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
13356 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
13357
13358 SDValue Passthru = N->getOperand(1);
13359 SDValue Pred = N->getOperand(2);
13360 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
13361 Pred, Scalar, Passthru);
13362}
13363
 // Lowers aarch64_sve_ext by bitcasting both inputs to bytes, scaling the
 // element index to a byte index, and emitting AArch64ISD::EXT.
 // NOTE(review): the signature line, the ACLE-type legality check, and the
 // ByteVT construction line are not visible in this chunk.
13365 SDLoc dl(N);
13366 LLVMContext &Ctx = *DAG.getContext();
13367 EVT VT = N->getValueType(0);
13368
13369 assert(VT.isScalableVector() && "Expected a scalable vector.");
13370
13371 // Current lowering only supports the SVE-ACLE types.
13373 return SDValue();
13374
13375 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
13376 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
13377 EVT ByteVT =
13379
13380 // Convert everything to the domain of EXT (i.e bytes).
13381 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
13382 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
 // Scale the lane index by the element size to get a byte offset.
13383 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
13384 DAG.getConstant(ElemSize, dl, MVT::i32));
13385
13386 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
13387 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
13388}
13389
13392 SelectionDAG &DAG) {
 // Converts an SVE wide-compare intrinsic whose comparator is a splat of an
 // in-range immediate into SETCC_MERGE_ZERO against a same-width splat.
 // NOTE(review): the signature lines are not visible in this chunk; `CC`
 // used at the end is presumably a parameter carrying the ISD condition
 // code for this intrinsic — confirm against the full source.
13393 if (DCI.isBeforeLegalize())
13394 return SDValue();
13395
13396 SDValue Comparator = N->getOperand(3);
13397 if (Comparator.getOpcode() == AArch64ISD::DUP ||
13398 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
13399 unsigned IID = getIntrinsicID(N);
13400 EVT VT = N->getValueType(0);
13401 EVT CmpVT = N->getOperand(2).getValueType();
13402 SDValue Pred = N->getOperand(1);
13403 SDValue Imm;
13404 SDLoc DL(N);
13405
13406 switch (IID) {
13407 default:
13408 llvm_unreachable("Called with wrong intrinsic!");
13409 break;
13410
13411 // Signed comparisons
13412 case Intrinsic::aarch64_sve_cmpeq_wide:
13413 case Intrinsic::aarch64_sve_cmpne_wide:
13414 case Intrinsic::aarch64_sve_cmpge_wide:
13415 case Intrinsic::aarch64_sve_cmpgt_wide:
13416 case Intrinsic::aarch64_sve_cmplt_wide:
13417 case Intrinsic::aarch64_sve_cmple_wide: {
 // Signed forms accept a 5-bit signed immediate: [-16, 15].
13418 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
13419 int64_t ImmVal = CN->getSExtValue();
13420 if (ImmVal >= -16 && ImmVal <= 15)
13421 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
13422 else
13423 return SDValue();
13424 }
13425 break;
13426 }
13427 // Unsigned comparisons
13428 case Intrinsic::aarch64_sve_cmphs_wide:
13429 case Intrinsic::aarch64_sve_cmphi_wide:
13430 case Intrinsic::aarch64_sve_cmplo_wide:
13431 case Intrinsic::aarch64_sve_cmpls_wide: {
 // Unsigned forms accept a 7-bit unsigned immediate: [0, 127].
13432 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
13433 uint64_t ImmVal = CN->getZExtValue();
13434 if (ImmVal <= 127)
13435 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
13436 else
13437 return SDValue();
13438 }
13439 break;
13440 }
13441 }
13442
13443 if (!Imm)
13444 return SDValue();
13445
13446 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
13447 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
13448 N->getOperand(2), Splat, DAG.getCondCode(CC));
13449 }
13450
13451 return SDValue();
13452}
13453
 // Builds a predicate-test sequence: sets flags (presumably via an
 // AArch64ISD::PTEST node on the line not visible here), then materialises
 // the requested condition as a 0/1 value via CSEL and zext/truncs to VT.
 // NOTE(review): the signature lines and the Test node construction are not
 // visible in this chunk.
13456 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13457
13458 SDLoc DL(Op);
13459 assert(Op.getValueType().isScalableVector() &&
13460 TLI.isTypeLegal(Op.getValueType()) &&
13461 "Expected legal scalable vector type!");
13462
13463 // Ensure target specific opcodes are using legal type.
13464 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
13465 SDValue TVal = DAG.getConstant(1, DL, OutVT);
13466 SDValue FVal = DAG.getConstant(0, DL, OutVT);
13467
13468 // Set condition code (CC) flags.
13470
13471 // Convert CC to integer based on requested condition.
13472 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
13473 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
13474 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
13475 return DAG.getZExtOrTrunc(Res, DL, VT);
13476}
13477
13479 SelectionDAG &DAG) {
 // Lowers a predicated SVE integer reduction intrinsic to the target node
 // Opc, then extracts lane 0 of the result vector. NOTE(review): the
 // signature line is not visible in this chunk.
13480 SDLoc DL(N);
13481
13482 SDValue Pred = N->getOperand(1);
13483 SDValue VecToReduce = N->getOperand(2);
13484
13485 // NOTE: The integer reduction's result type is not always linked to the
13486 // operand's element type so we construct it from the intrinsic's result type.
13487 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
13488 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
13489
13490 // SVE reductions set the whole vector register with the first element
13491 // containing the reduction result, which we'll now extract.
13492 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13494 Zero);
13495}
13496
13498 SelectionDAG &DAG) {
13499 SDLoc DL(N);
13500
13501 SDValue Pred = N->getOperand(1);
13502 SDValue VecToReduce = N->getOperand(2);
13503
13504 EVT ReduceVT = VecToReduce.getValueType();
13505 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
13506
13507 // SVE reductions set the whole vector register with the first element
13508 // containing the reduction result, which we'll now extract.
13509 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13511 Zero);
13512}
13513
13515 SelectionDAG &DAG) {
13516 SDLoc DL(N);
13517
13518 SDValue Pred = N->getOperand(1);
13519 SDValue InitVal = N->getOperand(2);
13520 SDValue VecToReduce = N->getOperand(3);
13521 EVT ReduceVT = VecToReduce.getValueType();
13522
13523 // Ordered reductions use the first lane of the result vector as the
13524 // reduction's initial value.
13525 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13527 DAG.getUNDEF(ReduceVT), InitVal, Zero);
13528
13529 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
13530
13531 // SVE reductions set the whole vector register with the first element
13532 // containing the reduction result, which we'll now extract.
13533 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13534 Zero);
13535}
13536
13537// If a merged operation has no inactive lanes we can relax it to a predicated
13538// or unpredicated operation, which potentially allows better isel (perhaps
13539// using immediate forms) or relaxing register reuse requirements.
13541 SelectionDAG &DAG) {
 // When the governing predicate is PTRUE(all) — i.e. no inactive lanes —
 // relax a merged SVE intrinsic to the predicated target node PredOpc,
 // enabling better isel (immediate forms, looser register reuse).
 // NOTE(review): the signature line is not visible in this chunk.
13542 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
13543 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
13544 SDValue Pg = N->getOperand(1);
13545
13546 // ISD way to specify an all active predicate.
13547 if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
13548 (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
13549 return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
13550 N->getOperand(2), N->getOperand(3));
13551
13552 // FUTURE: SplatVector(true)
13553 return SDValue();
13554}
13555
13558 const AArch64Subtarget *Subtarget) {
13559 SelectionDAG &DAG = DCI.DAG;
13560 unsigned IID = getIntrinsicID(N);
13561 switch (IID) {
13562 default:
13563 break;
13564 case Intrinsic::aarch64_neon_vcvtfxs2fp:
13565 case Intrinsic::aarch64_neon_vcvtfxu2fp:
13566 return tryCombineFixedPointConvert(N, DCI, DAG);
13567 case Intrinsic::aarch64_neon_saddv:
13569 case Intrinsic::aarch64_neon_uaddv:
13571 case Intrinsic::aarch64_neon_sminv:
13573 case Intrinsic::aarch64_neon_uminv:
13575 case Intrinsic::aarch64_neon_smaxv:
13577 case Intrinsic::aarch64_neon_umaxv:
13579 case Intrinsic::aarch64_neon_fmax:
13580 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
13581 N->getOperand(1), N->getOperand(2));
13582 case Intrinsic::aarch64_neon_fmin:
13583 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
13584 N->getOperand(1), N->getOperand(2));
13585 case Intrinsic::aarch64_neon_fmaxnm:
13586 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
13587 N->getOperand(1), N->getOperand(2));
13588 case Intrinsic::aarch64_neon_fminnm:
13589 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
13590 N->getOperand(1), N->getOperand(2));
13591 case Intrinsic::aarch64_neon_smull:
13592 case Intrinsic::aarch64_neon_umull:
13593 case Intrinsic::aarch64_neon_pmull:
13594 case Intrinsic::aarch64_neon_sqdmull:
13595 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
13596 case Intrinsic::aarch64_neon_sqshl:
13597 case Intrinsic::aarch64_neon_uqshl:
13598 case Intrinsic::aarch64_neon_sqshlu:
13599 case Intrinsic::aarch64_neon_srshl:
13600 case Intrinsic::aarch64_neon_urshl:
13601 case Intrinsic::aarch64_neon_sshl:
13602 case Intrinsic::aarch64_neon_ushl:
13603 return tryCombineShiftImm(IID, N, DAG);
13604 case Intrinsic::aarch64_crc32b:
13605 case Intrinsic::aarch64_crc32cb:
13606 return tryCombineCRC32(0xff, N, DAG);
13607 case Intrinsic::aarch64_crc32h:
13608 case Intrinsic::aarch64_crc32ch:
13609 return tryCombineCRC32(0xffff, N, DAG);
13610 case Intrinsic::aarch64_sve_saddv:
13611 // There is no i64 version of SADDV because the sign is irrelevant.
13612 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
13614 else
13616 case Intrinsic::aarch64_sve_uaddv:
13618 case Intrinsic::aarch64_sve_smaxv:
13620 case Intrinsic::aarch64_sve_umaxv:
13622 case Intrinsic::aarch64_sve_sminv:
13624 case Intrinsic::aarch64_sve_uminv:
13626 case Intrinsic::aarch64_sve_orv:
13628 case Intrinsic::aarch64_sve_eorv:
13630 case Intrinsic::aarch64_sve_andv:
13632 case Intrinsic::aarch64_sve_index:
13633 return LowerSVEIntrinsicIndex(N, DAG);
13634 case Intrinsic::aarch64_sve_dup:
13635 return LowerSVEIntrinsicDUP(N, DAG);
13636 case Intrinsic::aarch64_sve_dup_x:
13637 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
13638 N->getOperand(1));
13639 case Intrinsic::aarch64_sve_ext:
13640 return LowerSVEIntrinsicEXT(N, DAG);
13641 case Intrinsic::aarch64_sve_smin:
13643 case Intrinsic::aarch64_sve_umin:
13645 case Intrinsic::aarch64_sve_smax:
13647 case Intrinsic::aarch64_sve_umax:
13649 case Intrinsic::aarch64_sve_lsl:
13651 case Intrinsic::aarch64_sve_lsr:
13653 case Intrinsic::aarch64_sve_asr:
13655 case Intrinsic::aarch64_sve_cmphs:
13656 if (!N->getOperand(2).getValueType().isFloatingPoint())
13658 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13659 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
13660 break;
13661 case Intrinsic::aarch64_sve_cmphi:
13662 if (!N->getOperand(2).getValueType().isFloatingPoint())
13664 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13665 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
13666 break;
13667 case Intrinsic::aarch64_sve_cmpge:
13668 if (!N->getOperand(2).getValueType().isFloatingPoint())
13670 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13671 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
13672 break;
13673 case Intrinsic::aarch64_sve_cmpgt:
13674 if (!N->getOperand(2).getValueType().isFloatingPoint())
13676 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13677 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
13678 break;
13679 case Intrinsic::aarch64_sve_cmpeq:
13680 if (!N->getOperand(2).getValueType().isFloatingPoint())
13682 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13683 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
13684 break;
13685 case Intrinsic::aarch64_sve_cmpne:
13686 if (!N->getOperand(2).getValueType().isFloatingPoint())
13688 N->getValueType(0), N->getOperand(1), N->getOperand(2),
13689 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
13690 break;
13691 case Intrinsic::aarch64_sve_fadda:
13693 case Intrinsic::aarch64_sve_faddv:
13695 case Intrinsic::aarch64_sve_fmaxnmv:
13697 case Intrinsic::aarch64_sve_fmaxv:
13699 case Intrinsic::aarch64_sve_fminnmv:
13701 case Intrinsic::aarch64_sve_fminv:
13703 case Intrinsic::aarch64_sve_sel:
13704 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
13705 N->getOperand(1), N->getOperand(2), N->getOperand(3));
13706 case Intrinsic::aarch64_sve_cmpeq_wide:
13708 case Intrinsic::aarch64_sve_cmpne_wide:
13710 case Intrinsic::aarch64_sve_cmpge_wide:
13712 case Intrinsic::aarch64_sve_cmpgt_wide:
13714 case Intrinsic::aarch64_sve_cmplt_wide:
13716 case Intrinsic::aarch64_sve_cmple_wide:
13718 case Intrinsic::aarch64_sve_cmphs_wide:
13720 case Intrinsic::aarch64_sve_cmphi_wide:
13722 case Intrinsic::aarch64_sve_cmplo_wide:
13724 case Intrinsic::aarch64_sve_cmpls_wide:
13726 case Intrinsic::aarch64_sve_ptest_any:
13727 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13729 case Intrinsic::aarch64_sve_ptest_first:
13730 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13732 case Intrinsic::aarch64_sve_ptest_last:
13733 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13735 }
13736 return SDValue();
13737}
13738
13741 SelectionDAG &DAG) {
13742 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
13743 // we can convert that DUP into another extract_high (of a bigger DUP), which
13744 // helps the backend to decide that an sabdl2 would be useful, saving a real
13745 // extract_high operation.
13746 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
13747 (N->getOperand(0).getOpcode() == AArch64ISD::UABD ||
13748 N->getOperand(0).getOpcode() == AArch64ISD::SABD)) {
13749 SDNode *ABDNode = N->getOperand(0).getNode();
13750 SDValue NewABD =
13752 if (!NewABD.getNode())
13753 return SDValue();
13754
13755 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
13756 }
13757
13758 // This is effectively a custom type legalization for AArch64.
13759 //
13760 // Type legalization will split an extend of a small, legal, type to a larger
13761 // illegal type by first splitting the destination type, often creating
13762 // illegal source types, which then get legalized in isel-confusing ways,
13763 // leading to really terrible codegen. E.g.,
13764 // %result = v8i32 sext v8i8 %value
13765 // becomes
13766 // %losrc = extract_subreg %value, ...
13767 // %hisrc = extract_subreg %value, ...
13768 // %lo = v4i32 sext v4i8 %losrc
13769 // %hi = v4i32 sext v4i8 %hisrc
13770 // Things go rapidly downhill from there.
13771 //
13772 // For AArch64, the [sz]ext vector instructions can only go up one element
13773 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
13774 // take two instructions.
13775 //
13776 // This implies that the most efficient way to do the extend from v8i8
13777 // to two v4i32 values is to first extend the v8i8 to v8i16, then do
13778 // the normal splitting to happen for the v8i16->v8i32.
13779
13780 // This is pre-legalization to catch some cases where the default
13781 // type legalization will create ill-tempered code.
13782 if (!DCI.isBeforeLegalizeOps())
13783 return SDValue();
13784
13785 // We're only interested in cleaning things up for non-legal vector types
13786 // here. If both the source and destination are legal, things will just
13787 // work naturally without any fiddling.
13788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13789 EVT ResVT = N->getValueType(0);
13790 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
13791 return SDValue();
13792 // If the vector type isn't a simple VT, it's beyond the scope of what
13793 // we're worried about here. Let legalization do its thing and hope for
13794 // the best.
13795 SDValue Src = N->getOperand(0);
13796 EVT SrcVT = Src->getValueType(0);
13797 if (!ResVT.isSimple() || !SrcVT.isSimple())
13798 return SDValue();
13799
13800 // If the source VT is a 64-bit fixed or scalable vector, we can play games
13801 // and get the better results we want.
13802 if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
13803 return SDValue();
13804
13805 unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
13806 ElementCount SrcEC = SrcVT.getVectorElementCount();
13808 SDLoc DL(N);
13809 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
13810
13811 // Now split the rest of the operation into two halves, each with a 64
13812 // bit source.
13813 EVT LoVT, HiVT;
13814 SDValue Lo, Hi;
13815 LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
13816
13817 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
13818 LoVT.getVectorElementCount());
13819 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
13820 DAG.getConstant(0, DL, MVT::i64));
13821 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
13822 DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
13823 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
13824 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
13825
13826 // Now combine the parts back together so we still have a single result
13827 // like the combiner expects.
13828 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
13829}
13830
13832 SDValue SplatVal, unsigned NumVecElts) {
13833 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
13834 unsigned OrigAlignment = St.getAlignment();
13835 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
13836
13837 // Create scalar stores. This is at least as good as the code sequence for a
13838 // split unaligned store which is a dup.s, ext.b, and two stores.
13839 // Most of the time the three stores should be replaced by store pair
13840 // instructions (stp).
13841 SDLoc DL(&St);
13842 SDValue BasePtr = St.getBasePtr();
13843 uint64_t BaseOffset = 0;
13844
13845 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
13846 SDValue NewST1 =
13847 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
13848 OrigAlignment, St.getMemOperand()->getFlags());
13849
13850 // As this in ISel, we will not merge this add which may degrade results.
13851 if (BasePtr->getOpcode() == ISD::ADD &&
13852 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
13853 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
13854 BasePtr = BasePtr->getOperand(0);
13855 }
13856
13857 unsigned Offset = EltOffset;
13858 while (--NumVecElts) {
13859 unsigned Alignment = MinAlign(OrigAlignment, Offset);
13860 SDValue OffsetPtr =
13861 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
13862 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
13863 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
13864 PtrInfo.getWithOffset(Offset), Alignment,
13865 St.getMemOperand()->getFlags());
13866 Offset += EltOffset;
13867 }
13868 return NewST1;
13869}
13870
13871// Returns an SVE type that ContentTy can be trivially sign or zero extended
13872// into.
13874 assert(ContentTy.isSimple() && "No SVE containers for extended types");
13875
13876 switch (ContentTy.getSimpleVT().SimpleTy) {
13877 default:
13878 llvm_unreachable("No known SVE container for this MVT type");
13879 case MVT::nxv2i8:
13880 case MVT::nxv2i16:
13881 case MVT::nxv2i32:
13882 case MVT::nxv2i64:
13883 case MVT::nxv2f32:
13884 case MVT::nxv2f64:
13885 return MVT::nxv2i64;
13886 case MVT::nxv4i8:
13887 case MVT::nxv4i16:
13888 case MVT::nxv4i32:
13889 case MVT::nxv4f32:
13890 return MVT::nxv4i32;
13891 case MVT::nxv8i8:
13892 case MVT::nxv8i16:
13893 case MVT::nxv8f16:
13894 case MVT::nxv8bf16:
13895 return MVT::nxv8i16;
13896 case MVT::nxv16i8:
13897 return MVT::nxv16i8;
13898 }
13899}
13900
// Lower an SVE merging/zeroing LD1-style intrinsic node to the target load
// node 'Opc'. Unpacked integer result types are loaded via their wider SVE
// container type and truncated back afterwards.
13901 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
13902   SDLoc DL(N);
13903   EVT VT = N->getValueType(0);
13904 
// NOTE(review): line 13905 (the guard condition for this early return) was
// dropped by the doc extraction -- restore it from the original file before
// relying on this code.
13906     return SDValue();
13907 
// Integer results may be unpacked (e.g. nxv2i16); load them in the wider SVE
// container type and truncate back below.
13908   EVT ContainerVT = VT;
13909   if (ContainerVT.isInteger())
// NOTE(review): line 13910 was dropped; it is presumably
// `ContainerVT = getSVEContainerType(ContainerVT);` -- verify.
13911 
// NOTE(review): line 13912 (the `VTs` value-type-list declaration, presumably
// `SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);`) was dropped.
13913   SDValue Ops[] = { N->getOperand(0), // Chain
13914                     N->getOperand(2), // Pg
13915                     N->getOperand(3), // Base
13916                     DAG.getValueType(VT) };
13917 
13918   SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
13919   SDValue LoadChain = SDValue(Load.getNode(), 1);
13920 
// Truncate back to the requested unpacked type when a wider container type
// was used for the load.
13921   if (ContainerVT.isInteger() && (VT != ContainerVT))
13922     Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
13923 
13924   return DAG.getMergeValues({ Load, LoadChain }, DL);
13925 }
13926
13928 SDLoc DL(N);
13929 EVT VT = N->getValueType(0);
13930 EVT PtrTy = N->getOperand(3).getValueType();
13931
13932 if (VT == MVT::nxv8bf16 &&
13933 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13934 return SDValue();
13935
13936 EVT LoadVT = VT;
13937 if (VT.isFloatingPoint())
13939
13942 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
13943 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
13944 MINode->getOperand(2), PassThru,
13945 MINode->getMemoryVT(), MINode->getMemOperand(),
13947
13948 if (VT.isFloatingPoint()) {
13949 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
13950 return DAG.getMergeValues(Ops, DL);
13951 }
13952
13953 return L;
13954}
13955
13956template <unsigned Opcode>
13958 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
13960 "Unsupported opcode.");
13961 SDLoc DL(N);
13962 EVT VT = N->getValueType(0);
13963 if (VT == MVT::nxv8bf16 &&
13964 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13965 return SDValue();
13966
13967 EVT LoadVT = VT;
13968 if (VT.isFloatingPoint())
13970
13971 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
13972 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
13973 SDValue LoadChain = SDValue(Load.getNode(), 1);
13974
13975 if (VT.isFloatingPoint())
13976 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
13977
13978 return DAG.getMergeValues({Load, LoadChain}, DL);
13979}
13980
13982 SDLoc DL(N);
13983 SDValue Data = N->getOperand(2);
13984 EVT DataVT = Data.getValueType();
13987
13988 if (DataVT == MVT::nxv8bf16 &&
13989 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13990 return SDValue();
13991
13992 if (DataVT.isFloatingPoint())
13994
13996 if (Data.getValueType().isFloatingPoint())
13998 else
14000
14001 SDValue Ops[] = { N->getOperand(0), // Chain
14002 SrcNew,
14003 N->getOperand(4), // Base
14004 N->getOperand(3), // Pg
14005 InputVT
14006 };
14007
14008 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
14009}
14010
14012 SDLoc DL(N);
14013
14014 SDValue Data = N->getOperand(2);
14015 EVT DataVT = Data.getValueType();
14016 EVT PtrTy = N->getOperand(4).getValueType();
14017
14018 if (DataVT == MVT::nxv8bf16 &&
14019 !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14020 return SDValue();
14021
14022 if (DataVT.isFloatingPoint())
14023 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
14024
14026 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
14027 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
14028 MINode->getMemoryVT(), MINode->getMemOperand(),
14029 ISD::UNINDEXED, false, false);
14030}
14031
14032/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
14033/// load store optimizer pass will merge them to store pair stores. This should
14034/// be better than a movi to create the vector zero followed by a vector store
14035/// if the zero constant is not re-used, since one instructions and one register
14036/// live range will be removed.
14037///
14038/// For example, the final generated code should be:
14039///
14040/// stp xzr, xzr, [x0]
14041///
14042/// instead of:
14043///
14044/// movi v0.2d, #0
14045/// str q0, [x0]
14046///
14048 SDValue StVal = St.getValue();
14049 EVT VT = StVal.getValueType();
14050
14051 // Avoid scalarizing zero splat stores for scalable vectors.
14052 if (VT.isScalableVector())
14053 return SDValue();
14054
14055 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
14056 // 2, 3 or 4 i32 elements.
14058 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
14059 VT.getVectorElementType().getSizeInBits() == 64) ||
14060 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
14061 VT.getVectorElementType().getSizeInBits() == 32)))
14062 return SDValue();
14063
14064 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
14065 return SDValue();
14066
14067 // If the zero constant has more than one use then the vector store could be
14068 // better since the constant mov will be amortized and stp q instructions
14069 // should be able to be formed.
14070 if (!StVal.hasOneUse())
14071 return SDValue();
14072
14073 // If the store is truncating then it's going down to i16 or smaller, which
14074 // means it can be implemented in a single store anyway.
14075 if (St.isTruncatingStore())
14076 return SDValue();
14077
14078 // If the immediate offset of the address operand is too large for the stp
14079 // instruction, then bail out.
14080 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
14081 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
14083 return SDValue();
14084 }
14085
14086 for (int I = 0; I < NumVecElts; ++I) {
14087 SDValue EltVal = StVal.getOperand(I);
14089 return SDValue();
14090 }
14091
14092 // Use a CopyFromReg WZR/XZR here to prevent
14093 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
14094 SDLoc DL(&St);
14095 unsigned ZeroReg;
14096 EVT ZeroVT;
14097 if (VT.getVectorElementType().getSizeInBits() == 32) {
14098 ZeroReg = AArch64::WZR;
14099 ZeroVT = MVT::i32;
14100 } else {
14101 ZeroReg = AArch64::XZR;
14102 ZeroVT = MVT::i64;
14103 }
14106 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14107}
14108
14109/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
14110/// value. The load store optimizer pass will merge them to store pair stores.
14111/// This has better performance than a splat of the scalar followed by a split
14112/// vector store. Even if the stores are not merged it is four stores vs a dup,
14113/// followed by an ext.b and two stores.
14115 SDValue StVal = St.getValue();
14116 EVT VT = StVal.getValueType();
14117
14118 // Don't replace floating point stores, they possibly won't be transformed to
14119 // stp because of the store pair suppress pass.
14120 if (VT.isFloatingPoint())
14121 return SDValue();
14122
14123 // We can express a splat as store pair(s) for 2 or 4 elements.
14124 unsigned NumVecElts = VT.getVectorNumElements();
14125 if (NumVecElts != 4 && NumVecElts != 2)
14126 return SDValue();
14127
14128 // If the store is truncating then it's going down to i16 or smaller, which
14129 // means it can be implemented in a single store anyway.
14130 if (St.isTruncatingStore())
14131 return SDValue();
14132
14133 // Check that this is a splat.
14134 // Make sure that each of the relevant vector element locations are inserted
14135 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
14136 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
14138 for (unsigned I = 0; I < NumVecElts; ++I) {
14139 // Check for insert vector elements.
14140 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
14141 return SDValue();
14142
14143 // Check that same value is inserted at each vector element.
14144 if (I == 0)
14145 SplatVal = StVal.getOperand(1);
14146 else if (StVal.getOperand(1) != SplatVal)
14147 return SDValue();
14148
14149 // Check insert element index.
14151 if (!CIndex)
14152 return SDValue();
14153 uint64_t IndexVal = CIndex->getZExtValue();
14154 if (IndexVal >= NumVecElts)
14155 return SDValue();
14157
14158 StVal = StVal.getOperand(0);
14159 }
14160 // Check that all vector element locations were inserted to.
14161 if (IndexNotInserted.any())
14162 return SDValue();
14163
14164 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14165}
14166
14168 SelectionDAG &DAG,
14169 const AArch64Subtarget *Subtarget) {
14170
14172 if (S->isVolatile() || S->isIndexed())
14173 return SDValue();
14174
14175 SDValue StVal = S->getValue();
14176 EVT VT = StVal.getValueType();
14177
14178 if (!VT.isFixedLengthVector())
14179 return SDValue();
14180
14181 // If we get a splat of zeros, convert this vector store to a store of
14182 // scalars. They will be merged into store pairs of xzr thereby removing one
14183 // instruction and one register.
14185 return ReplacedZeroSplat;
14186
14187 // FIXME: The logic for deciding if an unaligned store should be split should
14188 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
14189 // a call to that function here.
14190
14191 if (!Subtarget->isMisaligned128StoreSlow())
14192 return SDValue();
14193
14194 // Don't split at -Oz.
14196 return SDValue();
14197
14198 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
14199 // those up regresses performance on micro-benchmarks and olden/bh.
14200 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
14201 return SDValue();
14202
14203 // Split unaligned 16B stores. They are terrible for performance.
14204 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
14205 // extensions can use this to mark that it does not want splitting to happen
14206 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
14207 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
14208 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
14209 S->getAlignment() <= 2)
14210 return SDValue();
14211
14212 // If we get a splat of a scalar convert this vector store to a store of
14213 // scalars. They will be merged into store pairs thereby removing two
14214 // instructions.
14216 return ReplacedSplat;
14217
14218 SDLoc DL(S);
14219
14220 // Split VT into two.
14222 unsigned NumElts = HalfVT.getVectorNumElements();
14224 DAG.getConstant(0, DL, MVT::i64));
14227 SDValue BasePtr = S->getBasePtr();
14228 SDValue NewST1 =
14229 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
14230 S->getAlignment(), S->getMemOperand()->getFlags());
14231 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14232 DAG.getConstant(8, DL, MVT::i64));
14233 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
14234 S->getPointerInfo(), S->getAlignment(),
14235 S->getMemOperand()->getFlags());
14236}
14237
14239 SDLoc DL(N);
14240 SDValue Op0 = N->getOperand(0);
14241 SDValue Op1 = N->getOperand(1);
14242 EVT ResVT = N->getValueType(0);
14243
14244 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
14245 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
14246 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
14247 SDValue X = Op0.getOperand(0).getOperand(0);
14248 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
14249 }
14250 }
14251
14252 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
14253 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
14254 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
14255 SDValue Z = Op1.getOperand(0).getOperand(1);
14256 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
14257 }
14258 }
14259
14260 return SDValue();
14261}
14262
14263/// Target-specific DAG combine function for post-increment LD1 (lane) and
14264/// post-increment LD1R.
14267 bool IsLaneOp) {
14268 if (DCI.isBeforeLegalizeOps())
14269 return SDValue();
14270
14271 SelectionDAG &DAG = DCI.DAG;
14272 EVT VT = N->getValueType(0);
14273
14274 if (VT.isScalableVector())
14275 return SDValue();
14276
14277 unsigned LoadIdx = IsLaneOp ? 1 : 0;
14278 SDNode *LD = N->getOperand(LoadIdx).getNode();
14279 // If it is not LOAD, can not do such combine.
14280 if (LD->getOpcode() != ISD::LOAD)
14281 return SDValue();
14282
14283 // The vector lane must be a constant in the LD1LANE opcode.
14284 SDValue Lane;
14285 if (IsLaneOp) {
14286 Lane = N->getOperand(2);
14287 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
14288 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
14289 return SDValue();
14290 }
14291
14293 EVT MemVT = LoadSDN->getMemoryVT();
14294 // Check if memory operand is the same type as the vector element.
14295 if (MemVT != VT.getVectorElementType())
14296 return SDValue();
14297
14298 // Check if there are other uses. If so, do not combine as it will introduce
14299 // an extra load.
14300 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
14301 ++UI) {
14302 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
14303 continue;
14304 if (*UI != N)
14305 return SDValue();
14306 }
14307
14308 SDValue Addr = LD->getOperand(1);
14309 SDValue Vector = N->getOperand(0);
14310 // Search for a use of the address operand that is an increment.
14311 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
14312 Addr.getNode()->use_end(); UI != UE; ++UI) {
14313 SDNode *User = *UI;
14314 if (User->getOpcode() != ISD::ADD
14315 || UI.getUse().getResNo() != Addr.getResNo())
14316 continue;
14317
14318 // If the increment is a constant, it must match the memory ref size.
14319 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14321 uint32_t IncVal = CInc->getZExtValue();
14322 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
14323 if (IncVal != NumBytes)
14324 continue;
14325 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
14326 }
14327
14328 // To avoid cycle construction make sure that neither the load nor the add
14329 // are predecessors to each other or the Vector.
14332 Visited.insert(Addr.getNode());
14333 Worklist.push_back(User);
14334 Worklist.push_back(LD);
14335 Worklist.push_back(Vector.getNode());
14336 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
14337 SDNode::hasPredecessorHelper(User, Visited, Worklist))
14338 continue;
14339
14341 Ops.push_back(LD->getOperand(0)); // Chain
14342 if (IsLaneOp) {
14343 Ops.push_back(Vector); // The vector to be inserted
14344 Ops.push_back(Lane); // The lane to be inserted in the vector
14345 }
14346 Ops.push_back(Addr);
14347 Ops.push_back(Inc);
14348
14349 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
14350 SDVTList SDTys = DAG.getVTList(Tys);
14353 MemVT,
14354 LoadSDN->getMemOperand());
14355
14356 // Update the uses.
14357 SDValue NewResults[] = {
14358 SDValue(LD, 0), // The result of load
14359 SDValue(UpdN.getNode(), 2) // Chain
14360 };
14361 DCI.CombineTo(LD, NewResults);
14362 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
14363 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
14364
14365 break;
14366 }
14367 return SDValue();
14368}
14369
14370/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
14371/// address translation.
14374 SelectionDAG &DAG) {
14376 KnownBits Known;
14377 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
14378 !DCI.isBeforeLegalizeOps());
14379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14380 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
14381 DCI.CommitTargetLoweringOpt(TLO);
14382 return true;
14383 }
14384 return false;
14385}
14386
14389 SelectionDAG &DAG,
14390 const AArch64Subtarget *Subtarget) {
14391 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
14392 return Split;
14393
14394 if (Subtarget->supportsAddressTopByteIgnored() &&
14395 performTBISimplification(N->getOperand(2), DCI, DAG))
14396 return SDValue(N, 0);
14397
14398 return SDValue();
14399}
14400
14403 SelectionDAG &DAG) {
14405 assert(MGS && "Can only combine gather load or scatter store nodes");
14406
14407 SDLoc DL(MGS);
14408 SDValue Chain = MGS->getChain();
14409 SDValue Scale = MGS->getScale();
14410 SDValue Index = MGS->getIndex();
14411 SDValue Mask = MGS->getMask();
14412 SDValue BasePtr = MGS->getBasePtr();
14413 ISD::MemIndexType IndexType = MGS->getIndexType();
14414
14415 EVT IdxVT = Index.getValueType();
14416
14417 if (DCI.isBeforeLegalize()) {
14418 // SVE gather/scatter requires indices of i32/i64. Promote anything smaller
14419 // prior to legalisation so the result can be split if required.
14420 if ((IdxVT.getVectorElementType() == MVT::i8) ||
14421 (IdxVT.getVectorElementType() == MVT::i16)) {
14422 EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32);
14423 if (MGS->isIndexSigned())
14424 Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
14425 else
14426 Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index);
14427
14428 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
14429 SDValue PassThru = MGT->getPassThru();
14430 SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale };
14431 return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
14432 PassThru.getValueType(), DL, Ops,
14433 MGT->getMemOperand(),
14434 MGT->getIndexType(), MGT->getExtensionType());
14435 } else {
14437 SDValue Data = MSC->getValue();
14438 SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale };
14439 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
14440 MSC->getMemoryVT(), DL, Ops,
14441 MSC->getMemOperand(), IndexType,
14442 MSC->isTruncatingStore());
14443 }
14444 }
14445 }
14446
14447 return SDValue();
14448}
14449
14450/// Target-specific DAG combine function for NEON load/store intrinsics
14451/// to merge base address updates.
14454 SelectionDAG &DAG) {
14455 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14456 return SDValue();
14457
14458 unsigned AddrOpIdx = N->getNumOperands() - 1;
14459 SDValue Addr = N->getOperand(AddrOpIdx);
14460
14461 // Search for a use of the address operand that is an increment.
14462 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
14463 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
14464 SDNode *User = *UI;
14465 if (User->getOpcode() != ISD::ADD ||
14466 UI.getUse().getResNo() != Addr.getResNo())
14467 continue;
14468
14469 // Check that the add is independent of the load/store. Otherwise, folding
14470 // it would create a cycle.
14473 Visited.insert(Addr.getNode());
14474 Worklist.push_back(N);
14475 Worklist.push_back(User);
14476 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
14477 SDNode::hasPredecessorHelper(User, Visited, Worklist))
14478 continue;
14479
14480 // Find the new opcode for the updating load/store.
14481 bool IsStore = false;
14482 bool IsLaneOp = false;
14483 bool IsDupOp = false;
14484 unsigned NewOpc = 0;
14485 unsigned NumVecs = 0;
14486 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
14487 switch (IntNo) {
14488 default: llvm_unreachable("unexpected intrinsic for Neon base update");
14489 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
14490 NumVecs = 2; break;
14491 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
14492 NumVecs = 3; break;
14493 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
14494 NumVecs = 4; break;
14495 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
14496 NumVecs = 2; IsStore = true; break;
14497 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
14498 NumVecs = 3; IsStore = true; break;
14499 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
14500 NumVecs = 4; IsStore = true; break;
14501 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
14502 NumVecs = 2; break;
14503 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
14504 NumVecs = 3; break;
14505 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
14506 NumVecs = 4; break;
14507 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
14508 NumVecs = 2; IsStore = true; break;
14509 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
14510 NumVecs = 3; IsStore = true; break;
14511 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
14512 NumVecs = 4; IsStore = true; break;
14513 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
14514 NumVecs = 2; IsDupOp = true; break;
14515 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
14516 NumVecs = 3; IsDupOp = true; break;
14517 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
14518 NumVecs = 4; IsDupOp = true; break;
14519 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
14520 NumVecs = 2; IsLaneOp = true; break;
14521 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
14522 NumVecs = 3; IsLaneOp = true; break;
14523 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
14524 NumVecs = 4; IsLaneOp = true; break;
14525 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
14526 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
14527 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
14528 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
14529 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
14530 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
14531 }
14532
14533 EVT VecTy;
14534 if (IsStore)
14535 VecTy = N->getOperand(2).getValueType();
14536 else
14537 VecTy = N->getValueType(0);
14538
14539 // If the increment is a constant, it must match the memory ref size.
14540 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14542 uint32_t IncVal = CInc->getZExtValue();
14543 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
14544 if (IsLaneOp || IsDupOp)
14545 NumBytes /= VecTy.getVectorNumElements();
14546 if (IncVal != NumBytes)
14547 continue;
14548 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
14549 }
14551 Ops.push_back(N->getOperand(0)); // Incoming chain
14552 // Load lane and store have vector list as input.
14553 if (IsLaneOp || IsStore)
14554 for (unsigned i = 2; i < AddrOpIdx; ++i)
14555 Ops.push_back(N->getOperand(i));
14556 Ops.push_back(Addr); // Base register
14557 Ops.push_back(Inc);
14558
14559 // Return Types.
14560 EVT Tys[6];
14561 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
14562 unsigned n;
14563 for (n = 0; n < NumResultVecs; ++n)
14564 Tys[n] = VecTy;
14565 Tys[n++] = MVT::i64; // Type of write back register
14566 Tys[n] = MVT::Other; // Type of the chain
14568
14571 MemInt->getMemoryVT(),
14572 MemInt->getMemOperand());
14573
14574 // Update the uses.
14575 std::vector<SDValue> NewResults;
14576 for (unsigned i = 0; i < NumResultVecs; ++i) {
14577 NewResults.push_back(SDValue(UpdN.getNode(), i));
14578 }
14579 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
14580 DCI.CombineTo(N, NewResults);
14581 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
14582
14583 break;
14584 }
14585 return SDValue();
14586}
14587
14588// Checks to see if the value is the prescribed width and returns information
14589// about its extension mode.
14590static
14591bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
14592 ExtType = ISD::NON_EXTLOAD;
14593 switch(V.getNode()->getOpcode()) {
14594 default:
14595 return false;
14596 case ISD::LOAD: {
14597 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
14598 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
14599 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
14600 ExtType = LoadNode->getExtensionType();
14601 return true;
14602 }
14603 return false;
14604 }
14605 case ISD::AssertSext: {
14606 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
14607 if ((TypeNode->getVT() == MVT::i8 && width == 8)
14608 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
14609 ExtType = ISD::SEXTLOAD;
14610 return true;
14611 }
14612 return false;
14613 }
14614 case ISD::AssertZext: {
14615 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
14616 if ((TypeNode->getVT() == MVT::i8 && width == 8)
14617 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
14618 ExtType = ISD::ZEXTLOAD;
14619 return true;
14620 }
14621 return false;
14622 }
14623 case ISD::Constant:
14624 case ISD::TargetConstant: {
14625 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
14626 1LL << (width - 1);
14627 }
14628 }
14629
14630 return true;
14631}
14632
14633// This function does a whole lot of voodoo to determine if the tests are
14634// equivalent without and with a mask. Essentially what happens is that given a
14635// DAG resembling:
14636//
14637// +-------------+ +-------------+ +-------------+ +-------------+
14638// | Input | | AddConstant | | CompConstant| | CC |
14639// +-------------+ +-------------+ +-------------+ +-------------+
14640// | | | |
14641// V V | +----------+
14642// +-------------+ +----+ | |
14643// | ADD | |0xff| | |
14644// +-------------+ +----+ | |
14645// | | | |
14646// V V | |
14647// +-------------+ | |
14648// | AND | | |
14649// +-------------+ | |
14650// | | |
14651// +-----+ | |
14652// | | |
14653// V V V
14654// +-------------+
14655// | CMP |
14656// +-------------+
14657//
14658// The AND node may be safely removed for some combinations of inputs. In
14659// particular we need to take into account the extension type of the Input,
14660// the exact values of AddConstant, CompConstant, and CC, along with the nominal
14661// width of the input (this can work for any width inputs, the above graph is
14662// specific to 8 bits).
14663//
14664// The specific equations were worked out by generating output tables for each
14665// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
14666// problem was simplified by working with 4 bit inputs, which means we only
14667// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
14668// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
14669// patterns present in both extensions (0,7). For every distinct set of
14670// AddConstant and CompConstants bit patterns we can consider the masked and
14671// unmasked versions to be equivalent if the result of this function is true for
14672// all 16 distinct bit patterns for the current extension type of Input (w0).
14673//
14674// sub w8, w0, w1
14675// and w10, w8, #0x0f
14676// cmp w8, w2
14677// cset w9, AArch64CC
14678// cmp w10, w2
14679// cset w11, AArch64CC
14680// cmp w9, w11
14681// cset w0, eq
14682// ret
14683//
14684// Since the above function shows when the outputs are equivalent it defines
14685// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
14686// would be expensive to run during compiles. The equations below were written
14687// in a test harness that confirmed they gave equivalent outputs to the above
14688// for all inputs, so they can be used to determine if the removal is
14689// legal instead.
14690//
14691// isEquivalentMaskless() is the code for testing if the AND can be removed
14692// factored out of the DAG recognition as the DAG can take several forms.
14693
// Decides, per AArch64 condition code, whether comparing the masked and the
// unmasked value produces the same result, i.e. whether the AND may be
// dropped before the SUBS feeding this condition (see the derivation in the
// comment block above).
//
// NOTE(review): this listing is a lossy extraction -- each line is prefixed
// with its original line number and several source lines are missing (gaps
// in the numbering, e.g. the continuation lines after some trailing "||").
// Code tokens below are left byte-identical; only comments were changed.
14694static bool isEquivalentMaskless(unsigned CC, unsigned width,
14695 ISD::LoadExtType ExtType, int AddConstant,
14696 int CompConstant) {
14697 // By being careful about our equations and only writing in terms of
14698 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
14699 // make them generally applicable to all bit widths.
14700 int MaxUInt = (1 << width);
14701
14702 // For the purposes of these comparisons sign extending the type is
14703 // equivalent to zero extending the add and displacing it by half the integer
14704 // width. Provided we are careful and make sure our equations are valid over
14705 // the whole range we can just adjust the input and avoid writing equations
14706 // for sign extended inputs.
14707 if (ExtType == ISD::SEXTLOAD)
14708 AddConstant -= (1 << (width-1));
14709
14710 switch(CC) {
14711 case AArch64CC::LE:
14712 case AArch64CC::GT:
14713 if ((AddConstant == 0) ||
14714 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
14715 (AddConstant >= 0 && CompConstant < 0) ||
// NOTE(review): line 14716 (final disjunct of this condition) is missing
// from the extraction.
14717 return true;
14718 break;
14719 case AArch64CC::LT:
14720 case AArch64CC::GE:
14721 if ((AddConstant == 0) ||
14722 (AddConstant >= 0 && CompConstant <= 0) ||
14723 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
14724 return true;
14725 break;
14726 case AArch64CC::HI:
14727 case AArch64CC::LS:
14728 if ((AddConstant >= 0 && CompConstant < 0) ||
// NOTE(review): lines 14729-14730 (remaining disjuncts) are missing.
14731 return true;
14732 break;
14733 case AArch64CC::PL:
14734 case AArch64CC::MI:
14735 if ((AddConstant == 0) ||
14736 (AddConstant > 0 && CompConstant <= 0) ||
// NOTE(review): line 14737 (final disjunct) is missing.
14738 return true;
14739 break;
14740 case AArch64CC::LO:
14741 case AArch64CC::HS:
14742 if ((AddConstant >= 0 && CompConstant <= 0) ||
// NOTE(review): lines 14743-14744 (remaining disjuncts) are missing.
14745 return true;
14746 break;
14747 case AArch64CC::EQ:
14748 case AArch64CC::NE:
14749 if ((AddConstant > 0 && CompConstant < 0) ||
// NOTE(review): lines 14750-14751 are missing.
14752 (AddConstant >= 0 && CompConstant >= 0 &&
// NOTE(review): lines 14753-14754 are missing.
14755 return true;
14756 break;
14757 case AArch64CC::VS:
14758 case AArch64CC::VC:
14759 case AArch64CC::AL:
14760 case AArch64CC::NV:
// Overflow/always/never conditions do not look at the masked value at all,
// so the AND is always removable.
14761 return true;
14762 case AArch64CC::Invalid:
14763 break;
14764 }
14765
14766 return false;
14767}
14768
// Tries to remove a superfluous AND (mask of 0xff or 0xffff) feeding a SUBS
// that in turn feeds the condition at operand CCIndex of N: if
// isEquivalentMaskless() proves the comparison result is identical without
// the mask, the SUBS is rebuilt directly on the ADD and the node is replaced.
//
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the function's return type / name line (14770-14771), the
// SUBS-opcode check (14778) and parts of two conditions (14816, 14822-14823)
// are missing. Code tokens left byte-identical; only comments added.
14769static
14772 SelectionDAG &DAG, unsigned CCIndex,
14773 unsigned CmpIndex) {
14774 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
14775 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
14776 unsigned CondOpcode = SubsNode->getOpcode();
14777
// NOTE(review): the guard on line 14778 (presumably rejecting non-SUBS
// opcodes -- TODO confirm against upstream) is missing from the extraction.
14779 return SDValue();
14780
14781 // There is a SUBS feeding this condition. Is it fed by a mask we can
14782 // use?
14783
14784 SDNode *AndNode = SubsNode->getOperand(0).getNode();
14785 unsigned MaskBits = 0;
14786
14787 if (AndNode->getOpcode() != ISD::AND)
14788 return SDValue();
14789
// Only byte (0xff) and halfword (0xffff) masks are handled.
14790 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
14791 uint32_t CNV = CN->getZExtValue();
14792 if (CNV == 255)
14793 MaskBits = 8;
14794 else if (CNV == 65535)
14795 MaskBits = 16;
14796 }
14797
14798 if (!MaskBits)
14799 return SDValue();
14800
14801 SDValue AddValue = AndNode->getOperand(0);
14802
14803 if (AddValue.getOpcode() != ISD::ADD)
14804 return SDValue();
14805
14806 // The basic dag structure is correct, grab the inputs and validate them.
14807
14808 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
14809 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
14810 SDValue SubsInputValue = SubsNode->getOperand(1);
14811
14812 // The mask is present and the provenance of all the values is a smaller type,
14813 // let's see if the mask is superfluous.
14814
14815 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
// NOTE(review): second half of this condition (line 14816) is missing.
14817 return SDValue();
14818
14819 ISD::LoadExtType ExtType;
14820
14821 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
// NOTE(review): lines 14822-14823 (further width checks) are missing.
14824 return SDValue();
14825
14826 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
14827 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
14828 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
14829 return SDValue();
14830
14831 // The AND is not necessary, remove it.
14832
14833 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
14834 SubsNode->getValueType(1));
14835 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
14836
14837 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
14838 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
14839
14840 return SDValue(N, 0);
14841}
14842
// Optimize compare with zero and branch.
// Folds "(brcond (cmp x, 0), eq/ne)" into CBZ/CBNZ when the flags have a
// single use and the compared value is i32/i64, after first trying the
// mask-removal combine above.
//
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature lines (14844-14845), the MachineFunction
// declaration (14847) and the compare-opcode check (14867) are missing.
// Code tokens left byte-identical; only comments added.
14843// Optimize compare with zero and branch.
14846 SelectionDAG &DAG) {
// NOTE(review): line 14847 (presumably obtaining the MachineFunction used
// as `MF` below -- TODO confirm) is missing from the extraction.
14848 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
14849 // will not be produced, as they are conditional branch instructions that do
14850 // not set flags.
14851 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
14852 return SDValue();
14853
// Try to strip a superfluous mask off the flag-producing SUBS first.
14854 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
14855 N = NV.getNode();
14856 SDValue Chain = N->getOperand(0);
14857 SDValue Dest = N->getOperand(1);
14858 SDValue CCVal = N->getOperand(2);
14859 SDValue Cmp = N->getOperand(3);
14860
14861 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
14862 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
14863 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
14864 return SDValue();
14865
14866 unsigned CmpOpc = Cmp.getOpcode();
// NOTE(review): the guard on line 14867 testing CmpOpc is missing.
14868 return SDValue();
14869
14870 // Only attempt folding if there is only one use of the flag and no use of the
14871 // value.
14872 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
14873 return SDValue();
14874
14875 SDValue LHS = Cmp.getOperand(0);
14876 SDValue RHS = Cmp.getOperand(1);
14877
14878 assert(LHS.getValueType() == RHS.getValueType() &&
14879 "Expected the value type to be the same for both operands!");
14880 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
14881 return SDValue();
14882
// Canonicalize the zero onto the RHS so only one pattern must be handled.
14883 if (isNullConstant(LHS))
14884 std::swap(LHS, RHS);
14885
14886 if (!isNullConstant(RHS))
14887 return SDValue();
14888
// Shifted operands are better served by the TBZ combines.
14889 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
14890 LHS.getOpcode() == ISD::SRL)
14891 return SDValue();
14892
14893 // Fold the compare into the branch instruction.
14894 SDValue BR;
14895 if (CC == AArch64CC::EQ)
14896 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
14897 else
14898 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
14899
14900 // Do not add new nodes to DAG combiner worklist.
14901 DCI.CombineTo(N, BR, false);
14902
14903 return SDValue();
14904}
14905
14906// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
14907// as well as whether the test should be inverted. This code is required to
14908// catch these cases (as opposed to standard dag combines) because
14909// AArch64ISD::TBZ is matched during legalization.
14910static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
14911 SelectionDAG &DAG) {
14912
14913 if (!Op->hasOneUse())
14914 return Op;
14915
14916 // We don't handle undef/constant-fold cases below, as they should have
14917 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
14918 // etc.)
14919
14920 // (tbz (trunc x), b) -> (tbz x, b)
14921 // This case is just here to enable more of the below cases to be caught.
14922 if (Op->getOpcode() == ISD::TRUNCATE &&
14923 Bit < Op->getValueType(0).getSizeInBits()) {
14924 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14925 }
14926
14927 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
14928 if (Op->getOpcode() == ISD::ANY_EXTEND &&
14929 Bit < Op->getOperand(0).getValueSizeInBits()) {
14930 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14931 }
14932
14933 if (Op->getNumOperands() != 2)
14934 return Op;
14935
14936 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14937 if (!C)
14938 return Op;
14939
14940 switch (Op->getOpcode()) {
14941 default:
14942 return Op;
14943
14944 // (tbz (and x, m), b) -> (tbz x, b)
14945 case ISD::AND:
14946 if ((C->getZExtValue() >> Bit) & 1)
14947 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14948 return Op;
14949
14950 // (tbz (shl x, c), b) -> (tbz x, b-c)
14951 case ISD::SHL:
14952 if (C->getZExtValue() <= Bit &&
14953 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
14954 Bit = Bit - C->getZExtValue();
14955 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14956 }
14957 return Op;
14958
14959 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
14960 case ISD::SRA:
14961 Bit = Bit + C->getZExtValue();
14962 if (Bit >= Op->getValueType(0).getSizeInBits())
14963 Bit = Op->getValueType(0).getSizeInBits() - 1;
14964 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14965
14966 // (tbz (srl x, c), b) -> (tbz x, b+c)
14967 case ISD::SRL:
14968 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
14969 Bit = Bit + C->getZExtValue();
14970 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14971 }
14972 return Op;
14973
14974 // (tbz (xor x, -1), b) -> (tbnz x, b)
14975 case ISD::XOR:
14976 if ((C->getZExtValue() >> Bit) & 1)
14977 Invert = !Invert;
14978 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14979 }
14980}
14981
// Optimize test single bit zero/non-zero and branch.
// Walks through the tested operand with getTestBitOperand() and, if a
// simpler source/bit was found, rebuilds the TBZ/TBNZ node (flipping the
// opcode when the walk recorded an inversion).
//
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature lines (14983-14984), the call computing
// `NewTestSrc` (14989) and the TBZ<->TBNZ opcode swap lines (14997,
// 14999-15000) are missing. Code tokens left byte-identical.
14982// Optimize test single bit zero/non-zero and branch.
14985 SelectionDAG &DAG) {
14986 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14987 bool Invert = false;
14988 SDValue TestSrc = N->getOperand(1);
// NOTE(review): line 14990 (presumably `SDValue NewTestSrc =
// getTestBitOperand(TestSrc, Bit, Invert, DAG);` -- TODO confirm) is missing.
14991 if (TestSrc == NewTestSrc)
14992 return SDValue();
14993
14994 unsigned NewOpc = N->getOpcode();
14995 if (Invert) {
14996 if (NewOpc == AArch64ISD::TBZ)
// NOTE(review): the opcode reassignment lines (14997, 14999-15000) are
// missing from the extraction.
14998 else {
15001 }
15002 }
15003
15004 SDLoc DL(N);
15005 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
15006 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
15007}
15008
15009// vselect (v1i1 setcc) ->
15010// vselect (v1iXX setcc) (XX is the size of the compared operand type)
15011// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
15012// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
15013// such VSELECT.
15015 SDValue N0 = N->getOperand(0);
15016 EVT CCVT = N0.getValueType();
15017
15018 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
15019 CCVT.getVectorElementType() != MVT::i1)
15020 return SDValue();
15021
15022 EVT ResVT = N->getValueType(0);
15023 EVT CmpVT = N0.getOperand(0).getValueType();
15024 // Only combine when the result type is of the same size as the compared
15025 // operands.
15026 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
15027 return SDValue();
15028
15029 SDValue IfTrue = N->getOperand(1);
15030 SDValue IfFalse = N->getOperand(2);
15031 SDValue SetCC =
15032 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
15033 N0.getOperand(0), N0.getOperand(1),
15034 cast<CondCodeSDNode>(N0.getOperand(2))->get());
15035 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
15036 IfTrue, IfFalse);
15037}
15038
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature lines (15043-15044) and several statement
// lines (15072, 15083, 15089, 15091, 15096) are missing. Code tokens are
// left byte-identical; only comments were added.
15039/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
15040/// the compare-mask instructions rather than going via NZCV, even if LHS and
15041/// RHS are really scalar. This replaces any scalar setcc in the above pattern
15042/// with a vector one followed by a DUP shuffle on the result.
15045 SelectionDAG &DAG = DCI.DAG;
15046 SDValue N0 = N->getOperand(0);
15047 EVT ResVT = N->getValueType(0);
15048
15049 if (N0.getOpcode() != ISD::SETCC)
15050 return SDValue();
15051
15052 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
15053 // scalar SetCCResultType. We also don't expect vectors, because we assume
15054 // that selects fed by vector SETCCs are canonicalized to VSELECT.
15055 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
15056 "Scalar-SETCC feeding SELECT has unexpected result type!");
15057
15058 // If NumMaskElts == 0, the comparison is larger than select result. The
15059 // largest real NEON comparison is 64-bits per lane, which means the result is
15060 // at most 32-bits and an illegal vector. Just bail out for now.
15061 EVT SrcVT = N0.getOperand(0).getValueType();
15062
15063 // Don't try to do this optimization when the setcc itself has i1 operands.
15064 // There are no legal vectors of i1, so this would be pointless.
15065 if (SrcVT == MVT::i1)
15066 return SDValue();
15067
15068 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
15069 if (!ResVT.isVector() || NumMaskElts == 0)
15070 return SDValue();
15071
// NOTE(review): line 15072 (presumably recomputing SrcVT as a vector of
// NumMaskElts elements -- TODO confirm) is missing from the extraction.
15073 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
15074
15075 // Also bail out if the vector CCVT isn't the same size as ResVT.
15076 // This can happen if the SETCC operand size doesn't divide the ResVT size
15077 // (e.g., f64 vs v3f32).
15078 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
15079 return SDValue();
15080
15081 // Make sure we didn't create illegal types, if we're not supposed to.
15082 assert(DCI.isBeforeLegalize() ||
// NOTE(review): line 15083-15084 (second half of the assert) truncated.
15084
15085 // First perform a vector comparison, where lane 0 is the one we're interested
15086 // in.
15087 SDLoc DL(N0);
15088 SDValue LHS =
// NOTE(review): lines 15089 and 15091 (the SCALAR_TO_VECTOR-style
// initializers of LHS/RHS -- TODO confirm) are missing.
15090 SDValue RHS =
15092 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
15093
15094 // Now duplicate the comparison mask we want across all other lanes.
15095 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
// NOTE(review): line 15096 (constructing `Mask` via a vector shuffle with
// DUPMask -- TODO confirm) is missing.
15097 Mask = DAG.getNode(ISD::BITCAST, DL,
15098 ResVT.changeVectorElementTypeToInteger(), Mask);
15099
15100 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
15101}
15102
15103/// Get rid of unnecessary NVCASTs (that don't change the type).
15105 if (N->getValueType(0) == N->getOperand(0).getValueType())
15106 return N->getOperand(0);
15107
15108 return SDValue();
15109}
15110
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature line (15114), the GlobalAddressSDNode cast
// (15117), the reference-kind constant (15119) and the size-bound check
// (15155) are missing. Code tokens left byte-identical; comments added.
15111// If all users of the globaladdr are of the form (globaladdr + constant), find
15112// the smallest constant, fold it into the globaladdr's offset and rewrite the
15113// globaladdr as (globaladdr + constant) - constant.
15115 const AArch64Subtarget *Subtarget,
15116 const TargetMachine &TM) {
// NOTE(review): line 15117 (presumably casting N to the GlobalAddressSDNode
// `GN` used below -- TODO confirm) is missing from the extraction.
15118 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
// NOTE(review): line 15119 (the reference-kind enumerator compared against)
// is missing.
15120 return SDValue();
15121
// Scan every user; bail out unless each one is (globaladdr + constant).
15122 uint64_t MinOffset = -1ull;
15123 for (SDNode *N : GN->uses()) {
15124 if (N->getOpcode() != ISD::ADD)
15125 return SDValue();
15126 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
15127 if (!C)
15128 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
15129 if (!C)
15130 return SDValue();
15131 MinOffset = std::min(MinOffset, C->getZExtValue());
15132 }
15133 uint64_t Offset = MinOffset + GN->getOffset();
15134
15135 // Require that the new offset is larger than the existing one. Otherwise, we
15136 // can end up oscillating between two possible DAGs, for example,
15137 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
15138 if (Offset <= uint64_t(GN->getOffset()))
15139 return SDValue();
15140
15141 // Check whether folding this offset is legal. It must not go out of bounds of
15142 // the referenced object to avoid violating the code model, and must be
15143 // smaller than 2^21 because this is the largest offset expressible in all
15144 // object formats.
15145 //
15146 // This check also prevents us from folding negative offsets, which will end
15147 // up being treated in the same way as large positive ones. They could also
15148 // cause code model violations, and aren't really common enough to matter.
15149 if (Offset >= (1 << 21))
15150 return SDValue();
15151
15152 const GlobalValue *GV = GN->getGlobal();
15153 Type *T = GV->getValueType();
15154 if (!T->isSized() ||
// NOTE(review): line 15155 (the object-size bound on Offset) is missing.
15156 return SDValue();
15157
15158 SDLoc DL(GN);
15159 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
15160 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
15161 DAG.getConstant(MinOffset, DL, MVT::i64));
15162}
15163
// NOTE(review): lossy extraction -- the signature line (15166), the splat of
// the shift amount (15172) and the final shift node (15174) are missing.
// Code tokens left byte-identical; comments added.
15164// Turns the vector of indices into a vector of byte offsets by scaling Offset
15165// by (BitWidth / 8).
15167 SDLoc DL, unsigned BitWidth) {
15168 assert(Offset.getValueType().isScalableVector() &&
15169 "This method is only for scalable vectors of offsets");
15170
// Shift amount is log2 of the element size in bytes (index -> byte offset).
15171 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
// NOTE(review): lines 15172 and 15174 (splatting Shift across the vector
// and returning the SHL of Offset by it -- TODO confirm) are missing.
15173
15175}
15176
// NOTE(review): lossy extraction -- the signature line (15185) and the two
// condition lines (15188, 15192) guarding the early `return false`s are
// missing. Code tokens left byte-identical; comments added.
15177/// Check if the value of \p OffsetInBytes can be used as an immediate for
15178/// the gather load/prefetch and scatter store instructions with vector base and
15179/// immediate offset addressing mode:
15180///
15181///      [<Zn>.[S|D]{, #<imm>}]
15182///
15183/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
15184
15186 unsigned ScalarSizeInBytes) {
15187 // The immediate is not a multiple of the scalar size.
// NOTE(review): line 15188 (the modulo test) is missing from the extraction.
15189 return false;
15190
15191 // The immediate is out of range.
// NOTE(review): line 15192 (the upper-bound test, presumably against
// 32 * ScalarSizeInBytes per the doc comment above -- TODO confirm) is
// missing.
15193 return false;
15194
15195 return true;
15196}
15197
15198/// Check if the value of \p Offset represents a valid immediate for the SVE
15199/// gather load/prefetch and scatter store instructions with vector base and
15200/// immediate offset addressing mode:
15201///
15202///      [<Zn>.[S|D]{, #<imm>}]
15203///
15204/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
// NOTE(review): the overload documented above (original lines 15205-15210)
// was lost entirely in this extraction; only its closing remnant (15211)
// survives below.
15211
// Lowers an SVE scatter-store intrinsic node to the matching AArch64ISD
// scatter opcode, legalizing base/offset operands as needed.
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature line (15212) and several statement lines
// (15263, 15265-15266, 15283, 15289, 15294, 15296, 15299) are missing.
// Code tokens left byte-identical; only comments were changed.
15213 unsigned Opcode,
15214 bool OnlyPackedOffsets = true) {
15215 const SDValue Src = N->getOperand(2);
15216 const EVT SrcVT = Src->getValueType(0);
15217 assert(SrcVT.isScalableVector() &&
15218 "Scatter stores are only possible for SVE vectors");
15219
15220 SDLoc DL(N);
15221 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
15222
15223 // Make sure that source data will fit into an SVE register
15224 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
15225 return SDValue();
15226
15227 // For FPs, ACLE only supports _packed_ single and double precision types.
15228 if (SrcElVT.isFloatingPoint())
15229 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
15230 return SDValue();
15231
15232 // Depending on the addressing mode, this is either a pointer or a vector of
15233 // pointers (that fits into one register)
15234 SDValue Base = N->getOperand(4);
15235 // Depending on the addressing mode, this is either a single offset or a
15236 // vector of offsets (that fits into one register)
15237 SDValue Offset = N->getOperand(5);
15238
15239 // For "scalar + vector of indices", just scale the indices. This only
15240 // applies to non-temporal scatters because there's no instruction that takes
15241 // indices.
15242 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
15243 Offset =
15244 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
15245 Opcode = AArch64ISD::SSTNT1_PRED;
15246 }
15247
15248 // In the case of non-temporal gather loads there's only one SVE instruction
15249 // per data-size: "scalar + vector", i.e.
15250 //    * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
15251 // Since we do have intrinsics that allow the arguments to be in a different
15252 // order, we may need to swap them to match the spec.
15253 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
15254 std::swap(Base, Offset);
15255
15256 // SST1_IMM requires that the offset is an immediate that is:
15257 //    * a multiple of #SizeInBytes,
15258 //    * in the range [0, 31 x #SizeInBytes],
15259 // where #SizeInBytes is the size in bytes of the stored items. For
15260 // immediates outside that range and non-immediate scalar offsets use SST1 or
15261 // SST1_UXTW instead.
15262 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
// NOTE(review): line 15263 (the isValidImmForSVEVecImmAddrMode(...) call
// whose argument list continues below) and lines 15265-15266 (the
// element-count branch selecting SST1_UXTW_PRED -- TODO confirm) are
// missing from the extraction.
15264 SrcVT.getScalarSizeInBits() / 8)) {
15267 else
15268 Opcode = AArch64ISD::SST1_PRED;
15269
15270 std::swap(Base, Offset);
15271 }
15272 }
15273
15274 auto &TLI = DAG.getTargetLoweringInfo();
15275 if (!TLI.isTypeLegal(Base.getValueType()))
15276 return SDValue();
15277
15278 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
15279 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
15280 // nxv2i64. Legalize accordingly.
15281 if (!OnlyPackedOffsets &&
15282 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
// NOTE(review): line 15283 (the extend of Offset to nxv2i64) is missing.
15284
15285 if (!TLI.isTypeLegal(Offset.getValueType()))
15286 return SDValue();
15287
15288 // Source value type that is representable in hardware
// NOTE(review): line 15289 (declaring `HwSrcVt`) is missing.
15290
15291 // Keep the original type of the input data to store - this is needed to be
15292 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
15293 // FP values we want the integer equivalent, so just use HwSrcVt.
// NOTE(review): lines 15294, 15296 (declaring/assigning `InputVT`) are
// missing.
15295 if (SrcVT.isFloatingPoint())
15297
15298 SDVTList VTs = DAG.getVTList(MVT::Other);
// NOTE(review): line 15299 (declaring `SrcNew`) is missing.
15300
15301 if (Src.getValueType().isFloatingPoint())
15302 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
15303 else
15304 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
15305
15306 SDValue Ops[] = {N->getOperand(0), // Chain
15307 SrcNew,
15308 N->getOperand(3), // Pg
15309 Base,
15310 Offset,
15311 InputVT};
15312
15313 return DAG.getNode(Opcode, DL, VTs, Ops);
15314}
15315
// Lowers an SVE gather-load intrinsic node to the matching AArch64ISD
// gather opcode, legalizing base/offset operands and truncating/bitcasting
// the result back to the requested type.
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the signature line (15316) and several statement lines
// (15339-15342, 15361-15371, 15386, 15389, 15394, 15398) are missing.
// Code tokens left byte-identical; only comments were changed.
15317 unsigned Opcode,
15318 bool OnlyPackedOffsets = true) {
15319 const EVT RetVT = N->getValueType(0);
15320 assert(RetVT.isScalableVector() &&
15321 "Gather loads are only possible for SVE vectors");
15322
15323 SDLoc DL(N);
15324
15325 // Make sure that the loaded data will fit into an SVE register
15326 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
15327 return SDValue();
15328
15329 // Depending on the addressing mode, this is either a pointer or a vector of
15330 // pointers (that fits into one register)
15331 SDValue Base = N->getOperand(3);
15332 // Depending on the addressing mode, this is either a single offset or a
15333 // vector of offsets (that fits into one register)
15334 SDValue Offset = N->getOperand(4);
15335
15336 // For "scalar + vector of indices", just scale the indices. This only
15337 // applies to non-temporal gathers because there's no instruction that takes
15338 // indices.
// NOTE(review): lines 15339-15340 (the GLDNT1_INDEX opcode check and the
// getScaledOffsetForBitWidth call whose argument list continues below) and
// 15342 (the opcode reassignment) are missing from the extraction.
15341 RetVT.getScalarSizeInBits());
15343 }
15344
15345 // In the case of non-temporal gather loads there's only one SVE instruction
15346 // per data-size: "scalar + vector", i.e.
15347 //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
15348 // Since we do have intrinsics that allow the arguments to be in a different
15349 // order, we may need to swap them to match the spec.
15350 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
15351 Offset.getValueType().isVector())
15352 std::swap(Base, Offset);
15353
15354 // GLD{FF}1_IMM requires that the offset is an immediate that is:
15355 //    * a multiple of #SizeInBytes,
15356 //    * in the range [0, 31 x #SizeInBytes],
15357 // where #SizeInBytes is the size in bytes of the loaded items. For
15358 // immediates outside that range and non-immediate scalar offsets use
15359 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
15360 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
// NOTE(review): lines 15361-15362 (the GLDFF1 variant check and the
// isValidImmForSVEVecImmAddrMode call whose argument list continues below)
// are missing.
15363 RetVT.getScalarSizeInBits() / 8)) {
// NOTE(review): lines 15364, 15366-15367 and 15370-15371 (the element-count
// branch and the UXTW/plain opcode selections -- TODO confirm) are missing.
15365 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
15368 else
15369 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
15372
15373 std::swap(Base, Offset);
15374 }
15375 }
15376
15377 auto &TLI = DAG.getTargetLoweringInfo();
15378 if (!TLI.isTypeLegal(Base.getValueType()))
15379 return SDValue();
15380
15381 // Some gather load variants allow unpacked offsets, but only as nxv2i32
15382 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
15383 // nxv2i64. Legalize accordingly.
15384 if (!OnlyPackedOffsets &&
15385 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
// NOTE(review): line 15386 (the extend of Offset to nxv2i64) is missing.
15387
15388 // Return value type that is representable in hardware
// NOTE(review): line 15389-15390 (declaring `HwRetVt`) is missing.
15390
15391 // Keep the original output value type around - this is needed to be able to
15392 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
15393 // values we want the integer equivalent, so just use HwRetVT.
// NOTE(review): line 15394 (declaring `OutVT`) is missing.
15395 if (RetVT.isFloatingPoint())
15396 OutVT = DAG.getValueType(HwRetVt);
15397
// NOTE(review): line 15398 (building the VTList `VTs`) is missing.
15399 SDValue Ops[] = {N->getOperand(0), // Chain
15400 N->getOperand(2), // Pg
15401 Base, Offset, OutVT};
15402
15403 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
15404 SDValue LoadChain = SDValue(Load.getNode(), 1);
15405
15406 if (RetVT.isInteger() && (RetVT != HwRetVt))
15407 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
15408
15409 // If the original return value was FP, bitcast accordingly. Doing it here
15410 // means that we can avoid adding TableGen patterns for FPs.
15411 if (RetVT.isFloatingPoint())
15412 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
15413
15414 return DAG.getMergeValues({Load, LoadChain}, DL);
15415}
15416
// Combines SIGN_EXTEND_INREG: folds a sign-extend of an unsigned unpack
// into a signed unpack, and folds a sign-extend of an SVE unsigned gather
// load into the corresponding signed-extending load opcode.
// NOTE(review): lossy extraction -- each line is prefixed with its original
// line number; the parameter line (15418), several statements (15428,
// 15446, 15457, 15527, 15535) and, most notably, all the case labels /
// opcode assignments of the big switch (15465-15521, only the `break`s
// survive) are missing. Code tokens left byte-identical; comments added.
15417static SDValue
15419 SelectionDAG &DAG) {
15420 SDLoc DL(N);
15421 SDValue Src = N->getOperand(0);
15422 unsigned Opc = Src->getOpcode();
15423
15424 // Sign extend of an unsigned unpack -> signed unpack
15425 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
15426
15427 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
// NOTE(review): line 15428 (the SUNPKLO alternative of this ternary) is
// missing from the extraction.
15429
15430 // Push the sign extend to the operand of the unpack
15431 // This is necessary where, for example, the operand of the unpack
15432 // is another unpack:
15433 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
15434 // ->
15435 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
15436 // ->
15437 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
15438 SDValue ExtOp = Src->getOperand(0);
15439 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
15440 EVT EltTy = VT.getVectorElementType();
15441 (void)EltTy;
15442
15443 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
15444 "Sign extending from an invalid type");
15445
// NOTE(review): line 15446-15447 (computing `ExtVT` for the pushed-down
// extend) is missing.
15447
15448 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
15449 ExtOp, DAG.getValueType(ExtVT));
15450
15451 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
15452 }
15453
15454 if (DCI.isBeforeLegalizeOps())
15455 return SDValue();
15456
// NOTE(review): line 15457 (a subtarget SVE guard -- TODO confirm) is
// missing.
15458 return SDValue();
15459
15460 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
15461 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
15462 unsigned NewOpc;
15463 unsigned MemVTOpNum = 4;
15464 switch (Opc) {
// NOTE(review): the case labels and `NewOpc = ...` assignments of this
// switch (original lines 15465-15521) were lost in the extraction; only
// the `MemVTOpNum` overrides and `break`s survive below.
15467 MemVTOpNum = 3;
15468 break;
15471 MemVTOpNum = 3;
15472 break;
15475 MemVTOpNum = 3;
15476 break;
15479 break;
15482 break;
15485 break;
15488 break;
15491 break;
15494 break;
15497 break;
15500 break;
15503 break;
15506 break;
15509 break;
15512 break;
15515 break;
15518 break;
15521 break;
15522 default:
15523 return SDValue();
15524 }
15525
15526 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
// NOTE(review): line 15527 (reading `SrcMemVT` from operand MemVTOpNum) is
// missing.
15528
15529 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
15530 return SDValue();
15531
15532 EVT DstVT = N->getValueType(0);
15533 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
15534
// NOTE(review): line 15535 (declaring the `Ops` SmallVector) is missing.
15536 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
15537 Ops.push_back(Src->getOperand(I));
15538
15539 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
15540 DCI.CombineTo(N, ExtLoad);
15541 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
15542
15543 // Return N so it doesn't get rechecked
15544 return SDValue(N, 0);
15545}
15546
// NOTE(review): lossy extraction -- the signature line (15550) and the line
// extending the offset vector (15560) are missing. Code tokens left
// byte-identical; comments added.
15547/// Legalize the gather prefetch (scalar + vector addressing mode) when the
15548/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
15549/// != nxv2i32) do not need legalization.
15551 const unsigned OffsetPos = 4;
15552 SDValue Offset = N->getOperand(OffsetPos);
15553
15554 // Not an unpacked vector, bail out.
15555 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
15556 return SDValue();
15557
15558 // Extend the unpacked offset vector to 64-bit lanes.
15559 SDLoc DL(N);
// NOTE(review): line 15560 (the node reassigning `Offset` as an extend to
// nxv2i64 -- TODO confirm) is missing from the extraction.
15561 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
15562 // Replace the offset operand with the 64-bit one.
15563 Ops[OffsetPos] = Offset;
15564
15565 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
15566}
15567
// NOTE(review): lossy extraction -- the signature line (15573) and the
// immediate-validity check (15577) are missing. Code tokens left
// byte-identical; comments added.
15568/// Combines a node carrying the intrinsic
15569/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
15570/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
15571/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
15572/// sve gather prefetch instruction with vector plus immediate addressing mode.
15574 unsigned ScalarSizeInBytes) {
15575 const unsigned ImmPos = 4, OffsetPos = 3;
15576 // No need to combine the node if the immediate is valid...
// NOTE(review): line 15577 (the isValidImmForSVEVecImmAddrMode(...) guard
// -- TODO confirm) is missing from the extraction.
15578 return SDValue();
15579
15580 // ...otherwise swap the offset base with the offset...
15581 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
15582 std::swap(Ops[ImmPos], Ops[OffsetPos]);
15583 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
15584 // `aarch64_sve_prfb_gather_uxtw_index`.
15585 SDLoc DL(N);
15586 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
15587 MVT::i64);
15588
15589 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
15590}
15591
// AArch64TargetLowering::PerformDAGCombine -- the central DAG-combine
// dispatcher: routes each node opcode (and, for intrinsic nodes, each
// intrinsic ID) to its dedicated perform*Combine helper.
// NOTE(review): this extraction is missing the signature line (original line
// 15592) and a number of `case` labels / continuation lines (e.g. the labels
// that preceded original lines 15623, 15629, 15633, 15646, 15661, 15663 and
// 15668, and the argument/continuation lines of several
// performGatherLoadCombine / performScatterStoreCombine calls). The code
// below is reproduced verbatim; restore the missing lines from upstream
// LLVM 12 before compiling.
15593 DAGCombinerInfo &DCI) const {
15594 SelectionDAG &DAG = DCI.DAG;
15595 switch (N->getOpcode()) {
15596 default:
15597 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
15598 break;
15599 case ISD::ABS:
15600 return performABSCombine(N, DAG, DCI, Subtarget);
15601 case ISD::ADD:
15602 case ISD::SUB:
15603 return performAddSubCombine(N, DCI, DAG);
15604 case ISD::XOR:
15605 return performXorCombine(N, DAG, DCI, Subtarget);
15606 case ISD::MUL:
15607 return performMulCombine(N, DAG, DCI, Subtarget);
15608 case ISD::SINT_TO_FP:
15609 case ISD::UINT_TO_FP:
15610 return performIntToFpCombine(N, DAG, Subtarget);
15611 case ISD::FP_TO_SINT:
15612 case ISD::FP_TO_UINT:
15613 return performFpToIntCombine(N, DAG, DCI, Subtarget);
15614 case ISD::FDIV:
15615 return performFDivCombine(N, DAG, DCI, Subtarget);
15616 case ISD::OR:
15617 return performORCombine(N, DCI, Subtarget);
15618 case ISD::AND:
15619 return performANDCombine(N, DCI);
15620 case ISD::SRL:
15621 return performSRLCombine(N, DCI);
15623 return performIntrinsicCombine(N, DCI, Subtarget);
15624 case ISD::ANY_EXTEND:
15625 case ISD::ZERO_EXTEND:
15626 case ISD::SIGN_EXTEND:
15627 return performExtendCombine(N, DCI, DAG);
15629 return performSignExtendInRegCombine(N, DCI, DAG);
15630 case ISD::TRUNCATE:
15631 return performVectorTruncateCombine(N, DCI, DAG);
15633 return performConcatVectorsCombine(N, DCI, DAG);
15634 case ISD::SELECT:
15635 return performSelectCombine(N, DCI);
15636 case ISD::VSELECT:
15637 return performVSelectCombine(N, DCI.DAG);
15638 case ISD::LOAD:
15639 if (performTBISimplification(N->getOperand(1), DCI, DAG))
15640 return SDValue(N, 0);
15641 break;
15642 case ISD::STORE:
15643 return performSTORECombine(N, DCI, DAG, Subtarget);
15644 case ISD::MGATHER:
15645 case ISD::MSCATTER:
15647 case AArch64ISD::BRCOND:
15648 return performBRCONDCombine(N, DCI, DAG);
15649 case AArch64ISD::TBNZ:
15650 case AArch64ISD::TBZ:
15651 return performTBZCombine(N, DCI, DAG);
15652 case AArch64ISD::CSEL:
15653 return performCONDCombine(N, DCI, DAG, 2, 3);
15654 case AArch64ISD::DUP:
15655 return performPostLD1Combine(N, DCI, false);
15656 case AArch64ISD::NVCAST:
15657 return performNVCASTCombine(N);
15658 case AArch64ISD::UZP1:
15659 return performUzpCombine(N, DAG);
15661 return performPostLD1Combine(N, DCI, true);
15663 return performExtractVectorEltCombine(N, DAG);
15664 case ISD::VECREDUCE_ADD:
15665 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
// Nested switch: dispatch on the intrinsic ID carried in operand 1 of the
// INTRINSIC_W_CHAIN / INTRINSIC_VOID node (its case label is missing above).
15668 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
15669 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
15670 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
15671 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
15672 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
15673 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
15674 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
15675 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
15676 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
15677 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
15678 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
15679 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
15680 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
15681 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
15682 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
15683 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
15684 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
15686 case Intrinsic::aarch64_neon_ld2:
15687 case Intrinsic::aarch64_neon_ld3:
15688 case Intrinsic::aarch64_neon_ld4:
15689 case Intrinsic::aarch64_neon_ld1x2:
15690 case Intrinsic::aarch64_neon_ld1x3:
15691 case Intrinsic::aarch64_neon_ld1x4:
15692 case Intrinsic::aarch64_neon_ld2lane:
15693 case Intrinsic::aarch64_neon_ld3lane:
15694 case Intrinsic::aarch64_neon_ld4lane:
15695 case Intrinsic::aarch64_neon_ld2r:
15696 case Intrinsic::aarch64_neon_ld3r:
15697 case Intrinsic::aarch64_neon_ld4r:
15698 case Intrinsic::aarch64_neon_st2:
15699 case Intrinsic::aarch64_neon_st3:
15700 case Intrinsic::aarch64_neon_st4:
15701 case Intrinsic::aarch64_neon_st1x2:
15702 case Intrinsic::aarch64_neon_st1x3:
15703 case Intrinsic::aarch64_neon_st1x4:
15704 case Intrinsic::aarch64_neon_st2lane:
15705 case Intrinsic::aarch64_neon_st3lane:
15706 case Intrinsic::aarch64_neon_st4lane:
15707 return performNEONPostLDSTCombine(N, DCI, DAG);
15708 case Intrinsic::aarch64_sve_ldnt1:
15709 return performLDNT1Combine(N, DAG);
15710 case Intrinsic::aarch64_sve_ld1rq:
15712 case Intrinsic::aarch64_sve_ld1ro:
15714 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
15716 case Intrinsic::aarch64_sve_ldnt1_gather:
15718 case Intrinsic::aarch64_sve_ldnt1_gather_index:
15719 return performGatherLoadCombine(N, DAG,
15721 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
15723 case Intrinsic::aarch64_sve_ld1:
15725 case Intrinsic::aarch64_sve_ldnf1:
15727 case Intrinsic::aarch64_sve_ldff1:
15729 case Intrinsic::aarch64_sve_st1:
15730 return performST1Combine(N, DAG);
15731 case Intrinsic::aarch64_sve_stnt1:
15732 return performSTNT1Combine(N, DAG);
15733 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
15735 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
15737 case Intrinsic::aarch64_sve_stnt1_scatter:
15739 case Intrinsic::aarch64_sve_stnt1_scatter_index:
15741 case Intrinsic::aarch64_sve_ld1_gather:
15743 case Intrinsic::aarch64_sve_ld1_gather_index:
15744 return performGatherLoadCombine(N, DAG,
15746 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
15748 /*OnlyPackedOffsets=*/false);
15749 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
15751 /*OnlyPackedOffsets=*/false);
15752 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
15753 return performGatherLoadCombine(N, DAG,
15755 /*OnlyPackedOffsets=*/false);
15756 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
15757 return performGatherLoadCombine(N, DAG,
15759 /*OnlyPackedOffsets=*/false);
15760 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
15762 case Intrinsic::aarch64_sve_ldff1_gather:
15764 case Intrinsic::aarch64_sve_ldff1_gather_index:
15765 return performGatherLoadCombine(N, DAG,
15767 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
15768 return performGatherLoadCombine(N, DAG,
15770 /*OnlyPackedOffsets=*/false);
15771 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
15772 return performGatherLoadCombine(N, DAG,
15774 /*OnlyPackedOffsets=*/false);
15775 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
15776 return performGatherLoadCombine(N, DAG,
15778 /*OnlyPackedOffsets=*/false);
15779 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
15780 return performGatherLoadCombine(N, DAG,
15782 /*OnlyPackedOffsets=*/false);
15783 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
15784 return performGatherLoadCombine(N, DAG,
15786 case Intrinsic::aarch64_sve_st1_scatter:
15788 case Intrinsic::aarch64_sve_st1_scatter_index:
15790 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
15792 /*OnlyPackedOffsets=*/false);
15793 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
15795 /*OnlyPackedOffsets=*/false);
15796 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
15797 return performScatterStoreCombine(N, DAG,
15799 /*OnlyPackedOffsets=*/false);
15800 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
15801 return performScatterStoreCombine(N, DAG,
15803 /*OnlyPackedOffsets=*/false);
15804 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
15806 case Intrinsic::aarch64_sve_tuple_get: {
15807 SDLoc DL(N);
15808 SDValue Chain = N->getOperand(0);
15809 SDValue Src1 = N->getOperand(2);
15810 SDValue Idx = N->getOperand(3);
15811
15812 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
15813 EVT ResVT = N->getValueType(0);
15814 uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
15816 SDValue Val =
15818 return DAG.getMergeValues({Val, Chain}, DL);
15819 }
15820 case Intrinsic::aarch64_sve_tuple_set: {
15821 SDLoc DL(N);
15822 SDValue Chain = N->getOperand(0);
15823 SDValue Tuple = N->getOperand(2);
15824 SDValue Idx = N->getOperand(3);
15825 SDValue Vec = N->getOperand(4);
15826
15827 EVT TupleVT = Tuple.getValueType();
15828 uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
15829
15830 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
15831 uint64_t NumLanes =
15833
15834 if ((TupleLanes % NumLanes) != 0)
15835 report_fatal_error("invalid tuple vector!");
15836
15837 uint64_t NumVecs = TupleLanes / NumLanes;
15838
15840 for (unsigned I = 0; I < NumVecs; ++I) {
15841 if (I == IdxConst)
15842 Opnds.push_back(Vec);
15843 else {
15845 Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
15846 Vec.getValueType(), Tuple, ExtIdx));
15847 }
15848 }
15849 SDValue Concat =
15850 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
15851 return DAG.getMergeValues({Concat, Chain}, DL);
15852 }
15853 case Intrinsic::aarch64_sve_tuple_create2:
15854 case Intrinsic::aarch64_sve_tuple_create3:
15855 case Intrinsic::aarch64_sve_tuple_create4: {
15856 SDLoc DL(N);
15857 SDValue Chain = N->getOperand(0);
15858
15860 for (unsigned I = 2; I < N->getNumOperands(); ++I)
15861 Opnds.push_back(N->getOperand(I));
15862
15863 EVT VT = Opnds[0].getValueType();
15867 (N->getNumOperands() - 2));
15869 return DAG.getMergeValues({Concat, Chain}, DL);
15870 }
15871 case Intrinsic::aarch64_sve_ld2:
15872 case Intrinsic::aarch64_sve_ld3:
15873 case Intrinsic::aarch64_sve_ld4: {
15874 SDLoc DL(N);
15875 SDValue Chain = N->getOperand(0);
15876 SDValue Mask = N->getOperand(2);
15877 SDValue BasePtr = N->getOperand(3);
15878 SDValue LoadOps[] = {Chain, Mask, BasePtr};
15879 unsigned IntrinsicID =
15880 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15881 SDValue Result =
15882 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
15883 return DAG.getMergeValues({Result, Chain}, DL);
15884 }
15885 default:
15886 break;
15887 }
15888 break;
15889 case ISD::GlobalAddress:
15890 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
15891 }
15892 return SDValue();
15893}
15894
15895// Check if the return value is used as only a return value, as otherwise
15896// we can't perform a tail-call. In particular, we need to check for
15897// target ISD nodes that are returns and any other "odd" constructs
15898// that the generic analysis code won't necessarily catch.
15899bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
15900 SDValue &Chain) const {
15901 if (N->getNumValues() != 1)
15902 return false;
15903 if (!N->hasNUsesOfValue(1, 0))
15904 return false;
15905
15906 SDValue TCChain = Chain;
15907 SDNode *Copy = *N->use_begin();
15908 if (Copy->getOpcode() == ISD::CopyToReg) {
15909 // If the copy has a glue operand, we conservatively assume it isn't safe to
15910 // perform a tail call.
15911 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
15912 MVT::Glue)
15913 return false;
15914 TCChain = Copy->getOperand(0);
15915 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
15916 return false;
15917
15918 bool HasRet = false;
15919 for (SDNode *Node : Copy->uses()) {
15920 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
15921 return false;
15922 HasRet = true;
15923 }
15924
15925 if (!HasRet)
15926 return false;
15927
15928 Chain = TCChain;
15929 return true;
15930}
15931
15932// Return whether the an instruction can potentially be optimized to a tail
15933// call. This will cause the optimizers to attempt to move, or duplicate,
15934// return instructions to help enable tail call optimizations for this
15935// instruction.
15936bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
15937 return CI->isTailCall();
15938}
15939
// Shared helper for pre-/post-indexed addressing: if Op is an ADD/SUB of a
// base and a signed 9-bit constant, extract Base/Offset and report via IsInc
// whether the update is an increment.
15940bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
15941 SDValue &Offset,
// NOTE(review): original line 15942 (the `ISD::MemIndexedMode &AM,` parameter)
// is missing from this extraction -- restore from upstream LLVM 12.
15943 bool &IsInc,
15944 SelectionDAG &DAG) const {
15945 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
15946 return false;
15947
15948 Base = Op->getOperand(0);
15949 // All of the indexed addressing mode instructions take a signed
15950 // 9 bit immediate offset.
15951 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
15952 int64_t RHSC = RHS->getSExtValue();
// Negation goes through uint64_t to avoid signed-overflow UB on INT64_MIN.
15953 if (Op->getOpcode() == ISD::SUB)
15954 RHSC = -(uint64_t)RHSC;
15955 if (!isInt<9>(RHSC))
15956 return false;
15957 IsInc = (Op->getOpcode() == ISD::ADD);
15958 Offset = Op->getOperand(1);
15959 return true;
15960 }
15961 return false;
15962}
15963
// Decompose a load/store base pointer into Base+Offset for pre-indexed
// addressing, delegating the ADD/SUB pattern match to getIndexedAddressParts.
// NOTE(review): original line 15966 (`ISD::MemIndexedMode &AM,` parameter) is
// missing from this extraction.
15964bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
15965 SDValue &Offset,
15967 SelectionDAG &DAG) const {
15968 EVT VT;
15969 SDValue Ptr;
15970 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15971 VT = LD->getMemoryVT();
15972 Ptr = LD->getBasePtr();
15973 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15974 VT = ST->getMemoryVT();
15975 Ptr = ST->getBasePtr();
15976 } else
15977 return false;
15978
15979 bool IsInc;
15980 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
15981 return false;
// NOTE(review): original line 15982 is missing -- upstream it sets
// `AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;` before returning. Restore it.
15983 return true;
15984}
15985
// Decompose the update node Op into Base+Offset for post-indexed addressing.
// Unlike the pre-indexed case, the matched base must equal the memory op's
// own pointer, because post-indexing writes the updated base back.
15986bool AArch64TargetLowering::getPostIndexedAddressParts(
15987 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
15988 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
15989 EVT VT;
15990 SDValue Ptr;
15991 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15992 VT = LD->getMemoryVT();
15993 Ptr = LD->getBasePtr();
15994 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15995 VT = ST->getMemoryVT();
15996 Ptr = ST->getBasePtr();
15997 } else
15998 return false;
15999
16000 bool IsInc;
16001 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
16002 return false;
16003 // Post-indexing updates the base, so it's not a valid transform
16004 // if that's not the same as the load's pointer.
16005 if (Ptr != Base)
16006 return false;
// NOTE(review): original line 16007 is missing -- upstream it sets
// `AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;` before returning. Restore it.
16008 return true;
16009}
16010
// Expand an illegal (b)f16 -> i16 BITCAST: insert the half value into the
// hsub lane of an f32 register, bitcast that to i32, then truncate to i16.
// NOTE(review): original line 16011 (the signature, presumably
// `static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,`)
// is missing from this extraction.
16012 SelectionDAG &DAG) {
16013 SDLoc DL(N);
16014 SDValue Op = N->getOperand(0);
16015
// Only the i16 <- f16/bf16 case needs this custom expansion; everything else
// is left for the common legalizer.
16016 if (N->getValueType(0) != MVT::i16 ||
16017 (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
16018 return;
16019
16020 Op = SDValue(
16021 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
16022 DAG.getUNDEF(MVT::i32), Op,
16023 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
16024 0);
16025 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
16026 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
16027}
16028
// Split a too-wide vector reduction: halve the operand, combine the halves
// with InterOp, then reduce the result with the across-lanes opcode AcrossOp.
// NOTE(review): original lines 16029-16030 (the signature, presumably
// `static void ReplaceReductionResults(SDNode *N, SmallVectorImpl<SDValue> &Results,`)
// are missing from this extraction.
16031 SelectionDAG &DAG, unsigned InterOp,
16032 unsigned AcrossOp) {
16033 EVT LoVT, HiVT;
16034 SDValue Lo, Hi;
16035 SDLoc dl(N);
16036 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
16037 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
16038 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
// NOTE(review): original line 16039 is missing -- upstream it builds SplitVal
// by applying AcrossOp to InterVal. Restore it before compiling.
16040 Results.push_back(SplitVal);
16041}
16042
// Split an i128 value into its (low, high) i64 halves.
16043static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
16044 SDLoc DL(N);
// NOTE(review): original lines 16045-16046 are missing -- upstream they
// declare Lo (TRUNCATE of N to i64) and start the Hi expression (TRUNCATE of
// the shifted value below). Restore from upstream LLVM 12.
16047 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
16048 DAG.getConstant(64, DL, MVT::i64)));
16049 return std::make_pair(Lo, Hi);
16050}
16051
// Custom-expand EXTRACT_SUBVECTOR on scalable integer vectors when the
// extract takes exactly the low or high half: use UUNPKLO/UUNPKHI on the
// widened element type and truncate back to the requested type.
16052void AArch64TargetLowering::ReplaceExtractSubVectorResults(
// NOTE(review): original line 16053 (remaining signature parameters,
// presumably `SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {`)
// is missing from this extraction.
16054 SDValue In = N->getOperand(0);
16055 EVT InVT = In.getValueType();
16056
16057 // Common code will handle these just fine.
16058 if (!InVT.isScalableVector() || !InVT.isInteger())
16059 return;
16060
16061 SDLoc DL(N);
16062 EVT VT = N->getValueType(0);
16063
16064 // The following checks bail if this is not a halving operation.
16065
// NOTE(review): original line 16066 is missing -- upstream it declares ResEC
// as the result type's element count (used in the checks below). Restore it.
16067
16068 if (InVT.getVectorElementCount() != (ResEC * 2))
16069 return;
16070
16071 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
16072 if (!CIndex)
16073 return;
16074
16075 unsigned Index = CIndex->getZExtValue();
16076 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
16077 return;
16078
16079 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
// NOTE(review): original line 16080 is missing -- upstream it computes
// ExtendedHalfVT (the widened half type used below). Restore it.
16081
16082 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
16083 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
16084}
16085
16086// Create an even/odd pair of X registers holding integer value V.
// NOTE(review): original line 16087 (the signature, presumably
// `static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {`) is
// missing from this extraction.
16088 SDLoc dl(V.getNode());
16089 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
// NOTE(review): original line 16090 is missing -- upstream it starts the VHi
// declaration (any-extend/truncate of the shifted value below). Restore it.
16091 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
16092 dl, MVT::i64);
// Big-endian targets store the halves in the opposite sub-register order.
16093 if (DAG.getDataLayout().isBigEndian())
16094 std::swap (VLo, VHi);
16095 SDValue RegClass =
16096 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
16097 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
16098 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
16099 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
16100 return SDValue(
16101 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
16102}
16103
// Lower a 128-bit ATOMIC_CMP_SWAP: with LSE (or outlined atomics) emit a
// CASP-family instruction wrapped in REG_SEQUENCE/EXTRACT_SUBREG; otherwise
// fall back to the CMP_SWAP_128 pseudo (LL/SC expansion).
// NOTE(review): this extraction is missing the start of the signature
// (original lines 16104-16105), the AtomicOrdering `case` labels of the
// switch below (16125/16128/16131/16134-16135), and several continuation
// lines (16142, 16144, 16148-16151, 16163, 16165, 16168, 16170). Restore
// from upstream LLVM 12 before compiling.
16106 SelectionDAG &DAG,
16107 const AArch64Subtarget *Subtarget) {
16108 assert(N->getValueType(0) == MVT::i128 &&
16109 "AtomicCmpSwap on types less than 128 should be legal");
16110
16111 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
16112 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
16113 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
16114 SDValue Ops[] = {
16115 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
16116 createGPRPairNode(DAG, N->getOperand(3)), // Store value
16117 N->getOperand(1), // Ptr
16118 N->getOperand(0), // Chain in
16119 };
16120
16121 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
16122
// Pick the CASP variant matching the memory ordering (plain / acquire /
// release / acq_rel-seq_cst); the labels themselves are missing (see NOTE).
16123 unsigned Opcode;
16124 switch (MemOp->getOrdering()) {
16126 Opcode = AArch64::CASPX;
16127 break;
16129 Opcode = AArch64::CASPAX;
16130 break;
16132 Opcode = AArch64::CASPLX;
16133 break;
16136 Opcode = AArch64::CASPALX;
16137 break;
16138 default:
16139 llvm_unreachable("Unexpected ordering!");
16140 }
16141
16143 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
16145
16146 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
16147 if (DAG.getDataLayout().isBigEndian())
16150 SDValue(CmpSwap, 0));
16152 SDValue(CmpSwap, 0));
16153 Results.push_back(
16154 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
16155 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
16156 return;
16157 }
16158
16159 auto Desired = splitInt128(N->getOperand(2), DAG);
16160 auto New = splitInt128(N->getOperand(3), DAG);
16161 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
16162 New.first, New.second, N->getOperand(0)};
16164 AArch64::CMP_SWAP_128, SDLoc(N),
16166
16167 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
16169
16171 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
16172 Results.push_back(SDValue(CmpSwap, 3));
16173}
16174
// Custom legalization of illegal-typed results: replaces nodes whose result
// type the target marked Custom with equivalent legal-typed sequences.
// NOTE(review): this extraction is missing several original lines, including
// signature continuation (16176), various `case` bodies/labels (16181,
// 16184-16187, 16196-16212, 16218, 16222, 16224, 16231-16232, 16237, 16242,
// 16245). Code below is verbatim; restore from upstream LLVM 12 before use.
16175void AArch64TargetLowering::ReplaceNodeResults(
16177 switch (N->getOpcode()) {
16178 default:
16179 llvm_unreachable("Don't know how to custom expand this");
16180 case ISD::BITCAST:
16182 return;
16183 case ISD::VECREDUCE_ADD:
16188 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
16189 return;
16190
16191 case ISD::CTPOP:
16192 if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
16193 Results.push_back(Result);
16194 return;
16195 case AArch64ISD::SADDV:
16197 return;
16198 case AArch64ISD::UADDV:
16200 return;
16201 case AArch64ISD::SMINV:
16203 return;
16204 case AArch64ISD::UMINV:
16206 return;
16207 case AArch64ISD::SMAXV:
16209 return;
16210 case AArch64ISD::UMAXV:
16212 return;
16213 case ISD::FP_TO_UINT:
16214 case ISD::FP_TO_SINT:
16215 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
16216 // Let normal code take care of it by not adding anything to Results.
16217 return;
16219 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
16220 return;
16221 case ISD::LOAD: {
16223 "unexpected load's value type");
// Volatile i128 loads are expanded to an LDP-style two-i64 load pair here.
16225 if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
16226 // Non-volatile loads are optimized later in AArch64's load/store
16227 // optimizer.
16228 return;
16229 }
16230
16233 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
16234 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
16235 LoadNode->getMemOperand());
16236
16238 Result.getValue(0), Result.getValue(1));
16239 Results.append({Pair, Result.getValue(2) /* Chain */});
16240 return;
16241 }
16243 ReplaceExtractSubVectorResults(N, Results, DAG);
16244 return;
16246 EVT VT = N->getValueType(0);
16247 assert((VT == MVT::i8 || VT == MVT::i16) &&
16248 "custom lowering for unexpected type");
16249
16250 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
16251 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
16252 switch (IntID) {
16253 default:
16254 return;
16255 case Intrinsic::aarch64_sve_clasta_n: {
16256 SDLoc DL(N);
16257 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
16258 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
16259 N->getOperand(1), Op2, N->getOperand(3));
16260 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16261 return;
16262 }
16263 case Intrinsic::aarch64_sve_clastb_n: {
16264 SDLoc DL(N);
16265 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
16266 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
16267 N->getOperand(1), Op2, N->getOperand(3));
16268 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16269 return;
16270 }
16271 case Intrinsic::aarch64_sve_lasta: {
16272 SDLoc DL(N);
16273 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
16274 N->getOperand(1), N->getOperand(2));
16275 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16276 return;
16277 }
16278 case Intrinsic::aarch64_sve_lastb: {
16279 SDLoc DL(N);
16280 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
16281 N->getOperand(1), N->getOperand(2));
16282 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16283 return;
16284 }
16285 }
16286 }
16287 }
16288}
16289
// NOTE(review): original line 16290 (the signature, presumably
// `bool AArch64TargetLowering::useLoadStackGuardNode() const {`) and line
// 16292 (the statement taken when the condition holds, presumably
// `return false;` since Android/Fuchsia use a TLS-slot guard via
// getIRStackGuard instead) are missing -- confirm against upstream LLVM 12.
16291 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
16293 return true;
16294}
16295
16296unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
16297 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
16298 // reciprocal if there are three or more FDIVs.
16299 return 3;
16300}
16301
// NOTE(review): original lines 16302-16303 (the signature, presumably
// `TargetLoweringBase::LegalizeTypeAction`
// `AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {`) are
// missing from this extraction.
16304 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
16305 // v4i16, v2i32 instead of to promote.
16306 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
16307 VT == MVT::v1f32)
16308 return TypeWidenVector;
16309
// NOTE(review): original line 16310 is missing -- upstream it falls back to
// `return TargetLoweringBase::getPreferredVectorAction(VT);`. Restore it.
16311}
16312
16313// Loads and stores less than 128-bits are already atomic; ones above that
16314// are doomed anyway, so defer to the default libcall and blame the OS when
16315// things go wrong.
// NOTE(review): original line 16316 (the signature, presumably
// `bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {`)
// is missing from this extraction. Only 128-bit stores are expanded; smaller
// ones are handled natively.
16317 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
16318 return Size == 128;
16319}
16320
16321// Loads and stores less than 128-bits are already atomic; ones above that
16322// are doomed anyway, so defer to the default libcall and blame the OS when
16323// things go wrong.
16329
16330// For the real atomic operations, we have ldxr/stxr up to 128 bits,
// NOTE(review): this extraction is missing the signature (original lines
// 16331-16332) and several return/continuation lines (16334, 16339, 16342,
// 16351-16354, 16357) -- e.g. the floating-point case presumably returns
// AtomicExpansionKind::None, the LSE path AtomicExpansionKind::None, and the
// final fallback AtomicExpansionKind::LLSC. Confirm against upstream LLVM 12.
16333 if (AI->isFloatingPointOperation())
16335
16336 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
16337 if (Size > 128) return AtomicExpansionKind::None;
16338 // Nand not supported in LSE.
16340 // Leave 128 bits to LLSC.
16341 if (Subtarget->hasLSE() && Size < 128)
16343 if (Subtarget->outlineAtomics() && Size < 128) {
16344 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
16345 // Don't outline them unless
16346 // (1) high level <atomic> support approved:
16347 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
16348 // (2) low level libgcc and compiler-rt support implemented by:
16349 // min/max outline atomics helpers
16350 if (AI->getOperation() != AtomicRMWInst::Min &&
16355 }
16356 }
16358}
16359
// NOTE(review): this extraction is missing the return type / signature start
// (original lines 16360-16361) and the return statements at 16365 and
// 16372-16373 (presumably AtomicExpansionKind::None for the LSE and -O0
// cases and an LL/SC fallback at the end). Confirm against upstream LLVM 12.
16362 AtomicCmpXchgInst *AI) const {
16363 // If subtarget has LSE, leave cmpxchg intact for codegen.
16364 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
16366 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
16367 // implement cmpxchg without spilling. If the address being exchanged is also
16368 // on the stack and close enough to the spill slot, this can lead to a
16369 // situation where the monitor always gets cleared and the atomic operation
16370 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
16371 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
16374}
16375
// Emit an exclusive (load-linked) load: LDAXP/LDXP for 128-bit values (the
// two i64 results are recombined into one i128), LDAXR/LDXR otherwise.
// NOTE(review): this extraction is missing the signature start (original line
// 16376) and the declarations at 16386/16388 and 16402/16404 (presumably the
// intrinsic ID variable and the `Function *Ldxr = Intrinsic::getDeclaration(...)`
// lines). Restore from upstream LLVM 12 before compiling.
16377 AtomicOrdering Ord) const {
16378 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16379 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
16380 bool IsAcquire = isAcquireOrStronger(Ord);
16381
16382 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
16383 // intrinsic must return {i64, i64} and we have to recombine them into a
16384 // single i128 here.
16385 if (ValTy->getPrimitiveSizeInBits() == 128) {
16387 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
16389
16390 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16391 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
16392
16393 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
16394 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
16395 Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
16396 Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
16397 return Builder.CreateOr(
16398 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
16399 }
16400
16401 Type *Tys[] = { Addr->getType() };
16403 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
16405
16406 Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
16407
// The intrinsic returns an integer; truncate to the element width and bitcast
// back to the original (possibly non-integer) element type.
16408 const DataLayout &DL = M->getDataLayout();
16409 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
16410 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
16411
16412 return Builder.CreateBitCast(Trunc, EltTy);
16413}
16414
// Clear the exclusive monitor (CLREX) on the no-store path of an expanded
// cmpxchg so a dangling exclusive reservation is not left behind.
// NOTE(review): original line 16415 (start of the signature) is missing from
// this extraction.
16416 IRBuilder<> &Builder) const {
16417 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16418 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
16419}
16420
// Emit a store-conditional: STLXP/STXP for 128-bit values (Val is split into
// two i64 halves), STLXR/STXR otherwise. Returns the i32 success flag.
// NOTE(review): this extraction is missing the signature start (original line
// 16421) and the declarations at 16431/16433 and 16442/16445 (presumably the
// intrinsic ID variable and the `Function *Stxr = Intrinsic::getDeclaration(...)`
// lines). Restore from upstream LLVM 12 before compiling.
16422 Value *Val, Value *Addr,
16423 AtomicOrdering Ord) const {
16424 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16425 bool IsRelease = isReleaseOrStronger(Ord);
16426
16427 // Since the intrinsics must have legal type, the i128 intrinsics take two
16428 // parameters: "i64, i64". We must marshal Val into the appropriate form
16429 // before the call.
16430 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
16432 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
16434 Type *Int64Ty = Type::getInt64Ty(M->getContext());
16435
16436 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
16437 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
16438 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16439 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
16440 }
16441
16443 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
16444 Type *Tys[] = { Addr->getType() };
16446
16447 const DataLayout &DL = M->getDataLayout();
16448 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
16449 Val = Builder.CreateBitCast(Val, IntValTy);
16450
16451 return Builder.CreateCall(Stxr,
16452 {Builder.CreateZExtOrBitCast(
16453 Val, Stxr->getFunctionType()->getParamType(0)),
16454 Addr});
16455}
16456
// Report whether an argument of type Ty must be passed in consecutive
// registers: array types, and scalable types wider than 128 bits, do.
// NOTE(review): original line 16457 (start of the signature, presumably
// `bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(`)
// is missing from this extraction.
16458 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
16459 if (Ty->isArrayTy())
16460 return true;
16461
16462 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
16463 if (TySize.isScalable() && TySize.getKnownMinSize() > 128)
16464 return true;
16465
16466 return false;
16467}
16468
16469bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
16470 EVT) const {
16471 return false;
16472}
16473
// Build an i8** pointing at a fixed byte offset from the thread pointer;
// used below for the Android/Fuchsia TLS stack-guard and SafeStack slots.
16474static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
16475 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
// NOTE(review): original line 16476 is missing -- upstream it declares
// `Function *ThreadPointerFunc =` for the call below. Restore it.
16477 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
16478 return IRB.CreatePointerCast(
16479 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
16480 Offset),
16481 IRB.getInt8PtrTy()->getPointerTo(0));
16482}
16483
// NOTE(review): original line 16484 (the signature, presumably
// `Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {`)
// is missing from this extraction.
16485 // Android provides a fixed TLS slot for the stack cookie. See the definition
16486 // of TLS_SLOT_STACK_GUARD in
16487 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
16488 if (Subtarget->isTargetAndroid())
16489 return UseTlsOffset(IRB, 0x28);
16490
16491 // Fuchsia is similar.
16492 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
16493 if (Subtarget->isTargetFuchsia())
16494 return UseTlsOffset(IRB, -0x10);
16495
// NOTE(review): original line 16496 is missing -- upstream it falls back to
// `return TargetLowering::getIRStackGuard(IRB);`. Restore it.
16497}
16498
// NOTE(review): original line 16499 (the signature, presumably
// `void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {`) is
// missing from this extraction.
16500 // MSVC CRT provides functionalities for stack protection.
16501 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
16502 // MSVC CRT has a global variable holding security cookie.
16503 M.getOrInsertGlobal("__security_cookie",
16504 Type::getInt8PtrTy(M.getContext()));
16505
16506 // MSVC CRT has a function to validate security cookie.
16507 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
16508 "__security_check_cookie", Type::getVoidTy(M.getContext()),
16509 Type::getInt8PtrTy(M.getContext()));
16510 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
16511 F->setCallingConv(CallingConv::Win64);
16512 F->addAttribute(1, Attribute::AttrKind::InReg);
16513 }
16514 return;
16515 }
// NOTE(review): original line 16516 is missing -- upstream it delegates to
// `TargetLowering::insertSSPDeclarations(M);`. Restore it.
16517}
16518
// NOTE(review): original line 16519 (the signature, presumably
// `Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {`)
// and line 16523 (the fallback, presumably
// `return TargetLowering::getSDagStackGuard(M);`) are missing -- confirm
// against upstream LLVM 12.
16520 // MSVC CRT has a global variable holding security cookie.
16521 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16522 return M.getGlobalVariable("__security_cookie");
16524}
16525
// NOTE(review): original line 16526 (the signature, presumably
// `Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {`)
// and line 16530 (the fallback, presumably
// `return TargetLowering::getSSPStackGuardCheck(M);`) are missing -- confirm
// against upstream LLVM 12.
16527 // MSVC CRT has a function to validate security cookie.
16528 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16529 return M.getFunction("__security_check_cookie");
16531}
16532
// NOTE(review): original line 16533 (the signature, presumably
// `Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {`)
// is missing from this extraction.
16534 // Android provides a fixed TLS slot for the SafeStack pointer. See the
16535 // definition of TLS_SLOT_SAFESTACK in
16536 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
16537 if (Subtarget->isTargetAndroid())
16538 return UseTlsOffset(IRB, 0x48);
16539
16540 // Fuchsia is similar.
16541 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
16542 if (Subtarget->isTargetFuchsia())
16543 return UseTlsOffset(IRB, -0x8);
16544
// NOTE(review): original line 16545 is missing -- upstream it falls back to
// `return TargetLowering::getSafeStackPointerLocation(IRB);`. Restore it.
16546}
16547
// NOTE(review): original line 16548 (start of the signature, presumably
// `bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(`) is missing
// from this extraction.
16549 const Instruction &AndI) const {
16550 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
16551 // this is likely to be fold the and/cmp/br into a single tbz instruction. It
16552 // may be beneficial to sink in other cases, but we would have to check that
16553 // the cmp would not get folded into the br to form a cbz for these to be
16554 // beneficial.
16555 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
16556 if (!Mask)
16557 return false;
16558 return Mask->getValue().isPowerOf2();
16559}
16560
// NOTE(review): this extraction is missing the signature start (original
// lines 16561-16563) and line 16567, which upstream is the call to the
// baseline TargetLowering hook whose result the `if` below negates. Confirm
// against upstream LLVM 12.
16564 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
16565 SelectionDAG &DAG) const {
16566 // Does baseline recommend not to perform the fold by default?
16568 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
16569 return false;
16570 // Else, if this is a vector shift, prefer 'shl'.
16571 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
16572}
16573
16575 SDNode *N) const {
16577 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
16578 return false;
16579 return true;
16580}
16581
// Record that this function saves/restores callee-saved registers via
// entry/exit copies (split CSR) rather than the normal prologue/epilogue.
// The copies themselves are emitted by insertCopiesSplitCSR below.
16583 // Update IsSplitCSR in AArch64FunctionInfo.
16584 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
16585 AFI->setIsSplitCSR(true);
16586}
16587
// insertCopiesSplitCSR: for each callee-saved register that is saved "via
// copy", copy it into a fresh virtual register in the entry block and copy
// it back before the terminator of every exit block. If the target reports
// no CSRs-via-copy for this function, nothing is done.
16589 MachineBasicBlock *Entry,
16590 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
16591 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16592 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
16593 if (!IStart)
16594 return;
16595
16596 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
16597 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
// Walk the null-terminated list of physical CSRs; only 64-bit GPR and FPR
// classes are expected here.
16599 for (const MCPhysReg *I = IStart; *I; ++I) {
16600 const TargetRegisterClass *RC = nullptr;
16601 if (AArch64::GPR64RegClass.contains(*I))
16602 RC = &AArch64::GPR64RegClass;
16603 else if (AArch64::FPR64RegClass.contains(*I))
16604 RC = &AArch64::FPR64RegClass;
16605 else
16606 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
16607
16608 Register NewVR = MRI->createVirtualRegister(RC);
16609 // Create copy from CSR to a virtual register.
16610 // FIXME: this currently does not emit CFI pseudo-instructions, it works
16611 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
16612 // nounwind. If we want to generalize this later, we may need to emit
16613 // CFI pseudo-instructions.
16614 assert(Entry->getParent()->getFunction().hasFnAttribute(
16615 Attribute::NoUnwind) &&
16616 "Function should be nounwind in insertCopiesSplitCSR!");
16617 Entry->addLiveIn(*I);
16618 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
16619 .addReg(*I);
16620
16621 // Insert the copy-back instructions right before the terminator.
16622 for (auto *Exit : Exits)
16623 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
16624 TII->get(TargetOpcode::COPY), *I)
16625 .addReg(NewVR);
16626 }
16627}
16628
16630 // Integer division on AArch64 is expensive. However, when aggressively
16631 // optimizing for code size, we prefer to use a div instruction, as it is
16632 // usually smaller than the alternative sequence.
16633 // The exception to this is vector division. Since AArch64 doesn't have vector
16634 // integer division, leaving the division as-is is a loss even in terms of
16635 // size, because it will have to be scalarized, while the alternative code
16636 // sequence can be performed in vector form.
16637 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
16638 return OptSize && !VT.isVector();
16639}
16640
16642 // We want inc-of-add for scalars and sub-of-not for vectors.
16643 return VT.isScalarInteger();
16644}
16645
16647 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
16648}
16649
16650unsigned
16652 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
16653 return getPointerTy(DL).getSizeInBits();
16654
16655 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
16656}
16657
16658void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
16661}
16662
16663// Unlike X86, we let frame lowering assign offsets to all catch objects.
16665 return false;
16666}
16667
// shouldLocalize: GlobalISel hook deciding whether an instruction may be
// duplicated ("localized") into the blocks that use it.
16668bool AArch64TargetLowering::shouldLocalize(
16669 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
16670 switch (MI.getOpcode()) {
16671 case TargetOpcode::G_GLOBAL_VALUE: {
16672 // On Darwin, TLS global vars get selected into function calls, which
16673 // we don't want localized, as they can get moved into the middle of
16674 // another call sequence.
16675 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
16676 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
16677 return false;
16678 break;
16679 }
16680 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
16681 // localizable.
16682 case AArch64::ADRP:
16683 case AArch64::G_ADD_LOW:
16684 return true;
16685 default:
16686 break;
16687 }
// NOTE(review): the fallthrough return is elided in this listing —
// presumably it defers to TargetLoweringBase::shouldLocalize; confirm
// against the upstream source.
16689}
16690
16692 if (isa<ScalableVectorType>(Inst.getType()))
16693 return true;
16694
16695 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
16697 return true;
16698
16699 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
16700 if (isa<ScalableVectorType>(AI->getAllocatedType()))
16701 return true;
16702 }
16703
16704 return false;
16705}
16706
16707// Return the largest legal scalable vector type that matches VT's element type.
16711 "Expected legal fixed length vector!");
16712 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
16713 default:
16714 llvm_unreachable("unexpected element type for SVE container");
16715 case MVT::i8:
16716 return EVT(MVT::nxv16i8);
16717 case MVT::i16:
16718 return EVT(MVT::nxv8i16);
16719 case MVT::i32:
16720 return EVT(MVT::nxv4i32);
16721 case MVT::i64:
16722 return EVT(MVT::nxv2i64);
16723 case MVT::f16:
16724 return EVT(MVT::nxv8f16);
16725 case MVT::f32:
16726 return EVT(MVT::nxv4f32);
16727 case MVT::f64:
16728 return EVT(MVT::nxv2f64);
16729 }
16730}
16731
16732// Return a PTRUE with active lanes corresponding to the extent of VT.
16734 EVT VT) {
16737 "Expected legal fixed length vector!");
16738
16739 int PgPattern;
16740 switch (VT.getVectorNumElements()) {
16741 default:
16742 llvm_unreachable("unexpected element count for SVE predicate");
16743 case 1:
16744 PgPattern = AArch64SVEPredPattern::vl1;
16745 break;
16746 case 2:
16747 PgPattern = AArch64SVEPredPattern::vl2;
16748 break;
16749 case 4:
16750 PgPattern = AArch64SVEPredPattern::vl4;
16751 break;
16752 case 8:
16753 PgPattern = AArch64SVEPredPattern::vl8;
16754 break;
16755 case 16:
16756 PgPattern = AArch64SVEPredPattern::vl16;
16757 break;
16758 case 32:
16759 PgPattern = AArch64SVEPredPattern::vl32;
16760 break;
16761 case 64:
16762 PgPattern = AArch64SVEPredPattern::vl64;
16763 break;
16764 case 128:
16765 PgPattern = AArch64SVEPredPattern::vl128;
16766 break;
16767 case 256:
16768 PgPattern = AArch64SVEPredPattern::vl256;
16769 break;
16770 }
16771
16772 // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
16773 // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
16774 // variants of instructions when available.
16775
16776 MVT MaskVT;
16777 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
16778 default:
16779 llvm_unreachable("unexpected element type for SVE predicate");
16780 case MVT::i8:
16782 break;
16783 case MVT::i16:
16784 case MVT::f16:
16786 break;
16787 case MVT::i32:
16788 case MVT::f32:
16790 break;
16791 case MVT::i64:
16792 case MVT::f64:
16794 break;
16795 }
16796
16797 return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
16799}
16800
16802 EVT VT) {
16804 "Expected legal scalable vector!");
16806 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
16807}
16808
16810 if (VT.isFixedLengthVector())
16811 return getPredicateForFixedLengthVector(DAG, DL, VT);
16812
16813 return getPredicateForScalableVector(DAG, DL, VT);
16814}
16815
16816// Grow V to consume an entire SVE register.
16818 assert(VT.isScalableVector() &&
16819 "Expected to convert into a scalable vector!");
16820 assert(V.getValueType().isFixedLengthVector() &&
16821 "Expected a fixed length vector operand!");
16822 SDLoc DL(V);
16823 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16824 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
16825}
16826
16827// Shrink V so it's just big enough to maintain a VT's worth of data.
16830 "Expected to convert into a fixed length vector!");
16831 assert(V.getValueType().isScalableVector() &&
16832 "Expected a scalable vector operand!");
16833 SDLoc DL(V);
16834 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16835 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
16836}
16837
16838// Convert all fixed length vector loads larger than NEON to masked_loads.
// The fixed-length load is rewritten as an SVE masked load of the scalable
// "container" type, then the result is narrowed back to the original
// fixed-length VT. The chain is preserved via getMergeValues.
16839SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
16840 SDValue Op, SelectionDAG &DAG) const {
16841 auto Load = cast<LoadSDNode>(Op);
16842
16843 SDLoc DL(Op);
16844 EVT VT = Op.getValueType();
16846
// Build the masked load over the container type, reusing the original
// load's memory VT, operand, addressing mode and extension type.
16847 auto NewLoad = DAG.getMaskedLoad(
16848 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
16850 Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
16851 Load->getExtensionType());
16852
16853 auto Result = convertFromScalableVector(DAG, VT, NewLoad);
16854 SDValue MergedValues[2] = {Result, Load->getChain()};
16855 return DAG.getMergeValues(MergedValues, DL);
16856}
16857
16858// Convert all fixed length vector stores larger than NEON to masked_stores.
// Mirrors LowerFixedLengthVectorLoadToSVE: widen the stored value into the
// scalable container type and emit an SVE masked store predicated on the
// extent of the original fixed-length type.
16859SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
16860 SDValue Op, SelectionDAG &DAG) const {
16861 auto Store = cast<StoreSDNode>(Op);
16862
16863 SDLoc DL(Op);
16864 EVT VT = Store->getValue().getValueType();
16866
16867 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
16868 return DAG.getMaskedStore(
16869 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
16870 getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
16871 Store->getMemOperand(), Store->getAddressingMode(),
16872 Store->isTruncatingStore());
16873}
16874
// Lower a fixed-length integer SDIV/UDIV via SVE. i32/i64 element types map
// directly onto predicated SVE divide nodes; i8/i16 have no SVE divide, so
// the operands are unpacked (widened) to the next element size, divided as
// two halves, and the halves are truncated/recombined.
16875SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
16876 SDValue Op, SelectionDAG &DAG) const {
16877 SDLoc dl(Op);
16878 EVT VT = Op.getValueType();
16880
16881 bool Signed = Op.getOpcode() == ISD::SDIV;
16883
16884 // Scalable vector i32/i64 DIV is supported.
16885 if (EltVT == MVT::i32 || EltVT == MVT::i64)
16886 return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
16887
16888 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
16891 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
16893
16894 // Convert the operands to scalable vectors.
16895 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
16896 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
16897
16898 // Extend the scalable operands.
// UnpkLo/UnpkHi are sign- or zero-extending unpacks chosen from `Signed`
// (their selection lines are elided in this listing).
16901 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
16902 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
16903 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
16904 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
16905
16906 // Convert back to fixed vectors so the DIV can be further lowered.
16911 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
16912 Op0Lo, Op1Lo);
16913 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
16914 Op0Hi, Op1Hi);
16915
16916 // Convert again to scalable vectors to truncate.
16921
16923}
16924
// Lower a fixed-length SIGN_EXTEND/ZERO_EXTEND via SVE by repeatedly
// unpacking the low half of the container until the requested element
// width is reached. Note the switch cases deliberately fall through when
// the target element type has not yet been reached.
16925SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
16926 SDValue Op, SelectionDAG &DAG) const {
16927 EVT VT = Op.getValueType();
16928 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
16929
16930 SDLoc DL(Op);
16931 SDValue Val = Op.getOperand(0);
16933 Val = convertToScalableVector(DAG, ContainerVT, Val);
16934
16935 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
16937
16938 // Repeatedly unpack Val until the result is of the desired element type.
16939 switch (ContainerVT.getSimpleVT().SimpleTy) {
16940 default:
16941 llvm_unreachable("unimplemented container type");
16942 case MVT::nxv16i8:
16943 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
16944 if (VT.getVectorElementType() == MVT::i16)
16945 break;
16947 case MVT::nxv8i16:
16948 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
16949 if (VT.getVectorElementType() == MVT::i32)
16950 break;
16952 case MVT::nxv4i32:
16953 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
16954 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
16955 break;
16956 }
16957
16958 return convertFromScalableVector(DAG, VT, Val);
16959}
16960
// Lower a fixed-length TRUNCATE via SVE: bitcast to the next-narrower
// container and use UZP1 (concatenate even elements) to halve the element
// width, repeating (cases fall through) until the desired element type is
// reached.
16961SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
16962 SDValue Op, SelectionDAG &DAG) const {
16963 EVT VT = Op.getValueType();
16964 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
16965
16966 SDLoc DL(Op);
16967 SDValue Val = Op.getOperand(0);
16969 Val = convertToScalableVector(DAG, ContainerVT, Val);
16970
16971 // Repeatedly truncate Val until the result is of the desired element type.
16972 switch (ContainerVT.getSimpleVT().SimpleTy) {
16973 default:
16974 llvm_unreachable("unimplemented container type");
16975 case MVT::nxv2i64:
16976 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
16977 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
16978 if (VT.getVectorElementType() == MVT::i32)
16979 break;
16981 case MVT::nxv4i32:
16982 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
16983 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
16984 if (VT.getVectorElementType() == MVT::i16)
16985 break;
16987 case MVT::nxv8i16:
16988 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
16989 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
16990 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
16991 break;
16992 }
16993
16994 return convertFromScalableVector(DAG, VT, Val);
16995}
16996
16997// Convert vector operation 'Op' to an equivalent predicated operation whereby
16998// the original operation's type is used to construct a suitable predicate.
16999// NOTE: The results for inactive lanes are undefined.
// Two paths: fixed-length VTs are widened to their scalable container type
// (operands converted one by one), while already-scalable VTs are passed
// straight through with the predicate prepended.
17000SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
17001 SelectionDAG &DAG,
17002 unsigned NewOp,
17003 bool OverrideNEON) const {
17004 EVT VT = Op.getValueType();
17005 SDLoc DL(Op);
17006 auto Pg = getPredicateForVector(DAG, DL, VT);
17007
17008 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
17010
17011 // Create list of operands by converting existing ones to scalable types.
17013 for (const SDValue &V : Op->op_values()) {
// Condition codes are type-less and are forwarded unchanged.
17014 if (isa<CondCodeSDNode>(V)) {
17015 Operands.push_back(V);
17016 continue;
17017 }
17018
// VT operands (e.g. for extension nodes) are rebuilt against the
// container's element count.
17019 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
17020 EVT VTArg = VTNode->getVT().getVectorElementType();
17021 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
17022 Operands.push_back(DAG.getValueType(NewVTArg));
17023 continue;
17024 }
17025
17026 assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
17027 "Only fixed length vectors are supported!");
17028 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
17029 }
17030
17032 Operands.push_back(DAG.getUNDEF(ContainerVT));
17034
17035 return convertFromScalableVector(DAG, VT, ScalableRes);
17036 }
17037
17038 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
17040
17041 for (const SDValue &V : Op->op_values()) {
17042 assert((!V.getValueType().isVector() ||
17043 V.getValueType().isScalableVector()) &&
17044 "Only scalable vectors are supported!");
17045 Operands.push_back(V);
17046 }
17047
17049 Operands.push_back(DAG.getUNDEF(VT));
17050
17051 return DAG.getNode(NewOp, DL, VT, Operands);
17052}
17053
17054// If a fixed length vector operation has no side effects when applied to
17055// undefined elements, we can safely use scalable vectors to perform the same
17056// operation without needing to worry about predication.
// Unlike LowerToPredicatedOp, the node keeps its original opcode — only the
// vector operands are "cast" to the scalable container type.
17057SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
17058 SelectionDAG &DAG) const {
17059 EVT VT = Op.getValueType();
17060 assert(useSVEForFixedLengthVectorVT(VT) &&
17061 "Only expected to lower fixed length vector operation!");
17063
17064 // Create list of operands by converting existing ones to scalable types.
17066 for (const SDValue &V : Op->op_values()) {
17067 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
17068
17069 // Pass through non-vector operands.
17070 if (!V.getValueType().isVector()) {
17071 Ops.push_back(V);
17072 continue;
17073 }
17074
17075 // "cast" fixed length vector to a scalable vector.
17076 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
17077 "Only fixed length vectors are supported!");
17078 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
17079 }
17080
17081 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
17082 return convertFromScalableVector(DAG, VT, ScalableRes);
17083}
17084
// Lower a strictly-ordered floating-point reduction (VECREDUCE_SEQ_FADD):
// operand 0 is the scalar accumulator, operand 1 the vector. The vector is
// (if needed) widened to a scalable type, the predicated ordered-add
// reduction is performed, and lane 0 of the result is extracted.
17085SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
17086 SelectionDAG &DAG) const {
17087 SDLoc DL(ScalarOp);
17088 SDValue AccOp = ScalarOp.getOperand(0);
17089 SDValue VecOp = ScalarOp.getOperand(1);
17090 EVT SrcVT = VecOp.getValueType();
17091 EVT ResVT = SrcVT.getVectorElementType();
17092
17094 if (SrcVT.isFixedLengthVector()) {
17097 }
17098
17100 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17101
17102 // Convert operands to Scalable.
// The scalar accumulator is inserted into lane 0 of an undef vector so the
// reduction node sees a uniform scalable operand.
17104 DAG.getUNDEF(ContainerVT), AccOp, Zero);
17105
17106 // Perform reduction.
17108 Pg, AccOp, VecOp);
17109
17110 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
17111}
17112
// Lower boolean (i1-element) vector reductions on scalable predicates:
//   VECREDUCE_OR  -> PTEST any-active
//   VECREDUCE_AND -> invert then PTEST none-active (all-active check)
//   VECREDUCE_XOR -> parity of the active-lane count via sve.cntp
// Returns SDValue() for anything else so the caller can fall back.
17113SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
17114 SelectionDAG &DAG) const {
17115 SDLoc DL(ReduceOp);
17116 SDValue Op = ReduceOp.getOperand(0);
17117 EVT OpVT = Op.getValueType();
17118 EVT VT = ReduceOp.getValueType();
17119
17120 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
17121 return SDValue();
17122
17124
17125 switch (ReduceOp.getOpcode()) {
17126 default:
17127 return SDValue();
17128 case ISD::VECREDUCE_OR:
17129 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
17130 case ISD::VECREDUCE_AND: {
// XOR with the all-true predicate inverts the input so "all lanes set"
// becomes "no lanes set".
17131 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
17132 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
17133 }
17134 case ISD::VECREDUCE_XOR: {
17135 SDValue ID =
17136 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
17137 SDValue Cntp =
17138 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
// Truncating the count to the (narrower) result type yields its low bit,
// i.e. the XOR/parity of the lanes.
17139 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
17140 }
17141 }
17142
17143 return SDValue();
17144}
17145
// Lower a generic vector reduction to a predicated SVE reduction node
// (Opcode), widening fixed-length sources to their scalable container and
// extracting lane 0 of the reduction result.
17146SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
17148 SelectionDAG &DAG) const {
17149 SDLoc DL(ScalarOp);
17150 SDValue VecOp = ScalarOp.getOperand(0);
17151 EVT SrcVT = VecOp.getValueType();
17152
17153 if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
17156 }
17157
17158 // UADDV always returns an i64 result.
17159 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
17160 SrcVT.getVectorElementType();
17161 EVT RdxVT = SrcVT;
17162 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
17164
17166 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
17168 Rdx, DAG.getConstant(0, DL, MVT::i64));
17169
17170 // The VEC_REDUCE nodes expect an element size result.
17171 if (ResVT != ScalarOp.getValueType())
17172 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
17173
17174 return Res;
17175}
17176
// Lower a fixed-length VSELECT via SVE: widen both value operands to the
// scalable container, turn the mask into an i1 predicate, select, then
// narrow the result back to the original fixed-length type.
17177SDValue
17178AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
17179 SelectionDAG &DAG) const {
17180 EVT VT = Op.getValueType();
17181 SDLoc DL(Op);
17182
17183 EVT InVT = Op.getOperand(1).getValueType();
17185 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
17186 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
17187
17188 // Convert the mask to a predicate (NOTE: We don't need to worry about
17189 // inactive lanes since VSELECT is safe when given undefined elements).
17190 EVT MaskVT = Op.getOperand(0).getValueType();
17192 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
17194 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
17195
17197 Mask, Op1, Op2);
17198
17199 return convertFromScalableVector(DAG, VT, ScalableRes);
17200}
17201
// Lower a fixed-length integer SETCC via SVE: compare the widened operands
// under an extent predicate, then promote the i1 predicate result to the
// integer result type callers expect. Floating-point comparisons are left
// for default expansion.
17202SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
17203 SDValue Op, SelectionDAG &DAG) const {
17204 SDLoc DL(Op);
17205 EVT InVT = Op.getOperand(0).getValueType();
17207
17208 assert(useSVEForFixedLengthVectorVT(InVT) &&
17209 "Only expected to lower fixed length vector operation!");
17210 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
17211 "Expected integer result of the same bit length as the inputs!");
17212
17213 // Expand floating point vector comparisons.
17214 if (InVT.isFloatingPoint())
17215 return SDValue();
17216
17217 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
17218 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
17219 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17220
17221 EVT CmpVT = Pg.getValueType();
// Operand 2 is the condition code of the original SETCC node.
17223 {Pg, Op1, Op2, Op.getOperand(2)});
17224
17225 EVT PromoteVT = ContainerVT.changeTypeToInteger();
17226 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
17227 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
17228}
17229
// Bitcast between legal scalable vector types, inserting
// REINTERPRET_CAST nodes where the source or destination is an "unpacked"
// layout (or an i1 predicate) so that the actual ISD::BITCAST only ever
// operates on fully-packed types.
17230SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
17231 SelectionDAG &DAG) const {
17232 SDLoc DL(Op);
17233 EVT InVT = Op.getValueType();
17234 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// TLI is only used inside asserts; the cast silences -Wunused in
// release (NDEBUG) builds.
17235 (void)TLI;
17236
17237 assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
17238 InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
17239 "Only expect to cast between legal scalable vector types!");
17241 (InVT.getVectorElementType() == MVT::i1) &&
17242 "Cannot cast between data and predicate scalable vector types!");
17243
17244 if (InVT == VT)
17245 return Op;
17246
// Predicate-to-predicate casts are pure reinterpretations.
17247 if (VT.getVectorElementType() == MVT::i1)
17248 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
17249
17251 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
17252 assert((VT == PackedVT || InVT == PackedInVT) &&
17253 "Cannot cast between unpacked scalable vector types!");
17254
17255 // Pack input if required.
17256 if (InVT != PackedInVT)
17258
17259 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
17260
17261 // Unpack result if required.
17262 if (VT != PackedVT)
17263 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
17264
17265 return Op;
17266}
unsigned const MachineRegisterInfo * MRI
if(Register::isVirtualRegister(Reg)) return MRI -> getRegClass(Reg) ->hasSuperClassEq(&AArch64::GPR64RegClass)
static unsigned MatchRegisterName(StringRef Name)
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG)
NarrowVector - Given a value in the V128 register class, produce the equivalent value in the V64 register class.
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static std::pair< SDValue, SDValue > splitInt128(SDValue N, SelectionDAG &DAG)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static bool areExtractShuffleVectors(Value *Op1, Value *Op2)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static bool isREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static bool isMergePassthruOpcode(unsigned Opc)
static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool canGuaranteeTCO(CallingConv::ID CC)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc, SelectionDAG &DAG)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static bool isTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N)
Get rid of unnecessary NVCASTs (that don't change the type).
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi)
An EXTR instruction is made up of two shifts, ORed together.
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static unsigned getIntrinsicID(const SDNode *N)
static bool IsSVECntIntrinsic(SDValue S)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, unsigned &Opcode, bool IsGather, SelectionDAG &DAG)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static PredicateConstraint parsePredicateConstraint(StringRef Constraint)
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool isSignExtended(SDNode *N, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
bool getGatherScatterIndexIsExtended(SDValue Index)
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
EXTR instruction extracts a contiguous chunk of bits from two existing registers viewed as a high/low...
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * UseTlsOffset(IRBuilder<> &IRB, unsigned Offset)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG)
#define LCALLNAME5(A, B)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, SelectionDAG &DAG)
Combines a dup(sext/zext) node pattern into sext/zext(dup) making use of the vector SExt/ZExt rather ...
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
static const unsigned PerfectShuffleTable[6561+1]
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static const MCPhysReg GPRArgRegs[]
Function Alias Analysis Results
assume Assume Builder
This file contains the simple types necessary to represent the attributes associated with functions a...
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< ShadowStackGC > C("shadow-stack", "Very portable GC for uncooperative code generators")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:280
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
PropagateLiveness Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
else return RetTy
#define LLVM_DEBUG(X)
Definition Debug.h:122
uint64_t Align
uint64_t Offset
uint64_t Addr
uint32_t Index
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
#define RegName(no)
lazy value info
#define F(x, y, z)
Definition MD5.cpp:56
#define I(x, y, z)
Definition MD5.cpp:59
#define G(x, y, z)
Definition MD5.cpp:57
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition Statistic.cpp:50
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:169
static const int BlockSize
Definition TarWriter.cpp:33
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:458
static constexpr int Concat[]
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
static bool hasSVEArgsOrReturn(const MachineFunction *MF)
unsigned getPrefLoopLogAlignment() const
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getPrefFunctionLogAlignment() const
bool isMisaligned128StoreSlow() const
const AArch64InstrInfo * getInstrInfo() const override
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
unsigned getMinSVEVectorSizeInBits() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isXRegisterReserved(size_t i) const
bool predictableSelectIsExpensive() const
bool useSVEForFixedLengthVectors() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns true if the given (atomic) store should be expanded by the IR-level AtomicExpand pass into an...
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
Value * emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
Value * getIRStackGuard(IRBuilder<> &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
Value * getSafeStackPointerLocation(IRBuilder<> &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
EVT is not used in-tree, but is used by out-of-tree target.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, unsigned Align=1, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, bool *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override
Return true if SHIFT instructions should be expanded to SHIFT_PARTS instructions, and false if a libr...
Value * emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
APInt bitcastToAPInt() const
Definition APFloat.h:1133
bool isPosZero() const
Definition APFloat.h:1217
void dump() const
Definition APFloat.cpp:4854
Class for arbitrary precision integers.
Definition APInt.h:70
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:948
static APInt getAllOnesValue(unsigned numBits)
Get the all-ones value.
Definition APInt.h:567
unsigned countTrailingZeros() const
Count the number of trailing zero bits.
Definition APInt.h:1700
unsigned logBase2() const
Definition APInt.h:1816
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:369
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:469
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition APInt.h:667
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition APInt.h:655
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1643
an instruction to allocate memory on the stack
This class represents an incoming formal argument to a Function.
Definition Argument.h:29
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but may be faster.
LLVM Basic Block Representation.
Definition BasicBlock.h:59
const BlockAddress * getBlockAddress() const
A "pseudo-class" with methods for operating on BUILD_VECTORs.
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
unsigned getNextStackOffset() const
getNextStackOffset - Return the next stack offset such that all stack slots satisfy their alignment r...
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
CCValAssign - Represent assignment of one arg/retval to a location.
Value * getArgOperand(unsigned i) const
unsigned getNumArgOperands() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
This is the shared class of boolean and integer constants.
Definition Constants.h:77
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:133
uint64_t getZExtValue() const
This is an important base class in LLVM.
Definition Constant.h:41
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:111
bool isBigEndian() const
Definition DataLayout.h:241
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:500
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:33
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:65
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:643
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:228
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:239
const Function & getFunction() const
Definition Function.h:135
arg_iterator arg_begin()
Definition Function.h:762
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.h:345
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This instruction inserts a single (scalar) element into a VectorType value.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
static ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:283
LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:357
ScalarTy getKnownMinValue() const
Returns the minimum value this size can represent.
Definition TypeSize.h:293
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
This class is used to represent ISD::LOAD nodes.
MCRegisterInfo base class - We assume that the target defines a static array of MCRegisterDesc object...
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static mvt_range fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static mvt_range integer_valuetypes()
static mvt_range integer_fixedlen_vector_valuetypes()
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
static MVT getVectorVT(MVT VT, unsigned NumElements)
static mvt_range fp_fixedlen_vector_valuetypes()
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static mvt_range fp_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setStackID(int ObjectIdx, uint8_t ID)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a callframe and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MSCATTER node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
bool isVolatile() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
unsigned getAlignment() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.cpp:397
void dump() const
Definition Pass.cpp:131
Class to represent pointers.
Type * getElementType() const
Wrapper class representing virtual and physical registers.
Definition Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=None, int Offset=0, unsigned TargetFlags=0)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo)
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplatValue(SDValue V)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
bool isZeroEltSplat() const
Return true if all elements of this shuffle are the same value as the first element of exactly one so...
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
StackOffset is a class to represent an offset with 2 dimensions, named fixed and scalable,...
Definition TypeSize.h:130
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:57
std::enable_if_t< std::numeric_limits< T >::is_signed, bool > getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:511
LLVM_NODISCARD StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:713
LLVM_NODISCARD size_t size() const
size - Get the string size.
Definition StringRef.h:160
Class to represent struct types.
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:366
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual Value * getSafeStackPointerLocation(IRBuilder<> &IRB) const
Returns the target-specific address of the unsafe stack pointer.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilder<> &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a targte-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned EmitCallSiteInfo
The flag enables call site info production.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:45
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition Triple.h:582
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition Triple.h:555
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:80
ScalarTy getFixedSize() const
Definition TypeSize.h:421
ScalarTy getKnownMinSize() const
Definition TypeSize.h:422
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:198
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:235
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:229
@ FloatTyID
32-bit floating point type
Definition Type.h:59
@ DoubleTyID
64-bit floating point type
Definition Type.h:60
static Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:180
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition Type.cpp:249
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:122
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Definition Type.h:163
ScalarTy getValue() const
Definition TypeSize.h:228
A Use represents the edge between a Value definition and its users.
Definition Use.h:44
const Use & getOperandUse(unsigned i) const
Definition User.h:182
Value * getOperand(unsigned i) const
Definition User.h:169
unsigned getNumOperands() const
Definition User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:246
User * user_back()
Definition Value.h:410
Base class of all SIMD vector types.
Type * getElementType() const
Implementation for an ilist node.
Definition ilist_node.h:39
self_iterator getIterator()
Definition ilist_node.h:81
#define UINT64_MAX
Definition DataTypes.h:77
#define INT64_MAX
Definition DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:80
@ AArch64_SVE_VectorCall
Calling convention between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:87
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:42
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ C
C - The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:651
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:229
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition ISDOpcodes.h:954
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition ISDOpcodes.h:950
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:456
@ FLT_ROUNDS_
FLT_ROUNDS_ - Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to ...
Definition ISDOpcodes.h:772
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:243
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:527
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:615
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition ISDOpcodes.h:983
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:262
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:232
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:863
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:681
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:460
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:192
@ GlobalAddress
Definition ISDOpcodes.h:71
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:688
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:513
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:371
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:589
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:248
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:790
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:222
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:72
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:675
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:430
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:558
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:94
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition ISDOpcodes.h:857
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:808
@ BR_CC
BR_CC - Conditional branch.
Definition ISDOpcodes.h:905
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:307
@ BRIND
BRIND - Indirect branch.
Definition ISDOpcodes.h:884
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:888
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:329
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:628
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:215
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:565
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition ISDOpcodes.h:979
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:303
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:570
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:606
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:550
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:541
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:505
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:196
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:678
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:643
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition ISDOpcodes.h:840
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:311
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:873
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:696
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition ISDOpcodes.h:575
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:775
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:637
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:429
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:87
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:423
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:445
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:422
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:853
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:734
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:151
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:581
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:177
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:272
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:494
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:763
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:99
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:684
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition ISDOpcodes.h:974
@ BRCOND
BRCOND - Conditional branch.
Definition ISDOpcodes.h:898
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:664
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:59
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:470
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:185
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:485
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
bool match(Val *V, const Pattern &P)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
class_match< UndefValue > m_Undef()
Match an arbitrary undef constant.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_combine_or< CastClass_match< OpTy, Instruction::ZExt >, CastClass_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
CodeModel::Model getCodeModel()
constexpr double e
Definition MathExtras.h:58
This class represents lattice values for constants.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1518
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool operator==(uint64_t V1, const APInt &V2)
Definition APInt.h:2035
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:456
bool RetCC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:177
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:497
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1332
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:603
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:486
unsigned M1(unsigned Val)
Definition VE.h:372
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1505
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:597
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned countLeadingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition MathExtras.h:226
constexpr size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition STLExtras.h:1341
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition MathExtras.h:157
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:132
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition ArrayRef.h:458
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:140
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:474
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Z
zlib style compression
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition DAGCombine.h:15
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:461
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:158
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition MathExtras.h:673
@ Invalid
Denotes invalid value.
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1525
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:1667
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:944
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:355
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:121
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:131
ElementCount getVectorElementCount() const
Definition ValueTypes.h:315
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:417
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:333
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:324
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:345
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:424
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:278
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:177
EVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:111
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:341
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:398
bool isFixedLengthVector() const
Definition ValueTypes.h:156
std::string getEVTString() const
This function returns value type as a string, e.g. "i32".
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:146
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:285
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:152
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:290
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:141
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:298
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:407
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:136
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:172
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits commonBits(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits common to LHS and RHS.
Definition KnownBits.h:284
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:40
Structure used to represent pair of argument number after call lowering and register used to transfer...
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:119
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowReassociation(bool b)
void setNoUnsignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64